## Loading the dataset

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Toggle between the local CSV and the copy hosted on GitHub.
is_dataset_local = True
dataset = pd.read_csv(
    'hotel_bookings.csv'
    if is_dataset_local
    else 'https://raw.githubusercontent.com/Sid-darthvader/DoWhy-The-Causal-Story-Behind-Hotel-Booking-Cancellations/master/hotel_bookings.csv'
)

In [None]:
# Preview the first rows of the raw data.
dataset.head()

In [None]:
# Column names, dtypes and non-null counts of the raw data.
dataset.info()

## EDA

#### Detect and clean missing data

In [None]:
# Keep an untouched copy of the raw data; the deposit-type plot further down reads from it.
dataset_not_processed = dataset.copy()

In [None]:
# Fraction of missing values per column.
dataset.isna().mean()

In [None]:
# The company column (ID of the company/entity that made or pays for the booking)
# is missing for almost every row, so it carries no usable information
# for the experiment and is removed.

dataset = dataset.drop(columns=['company'])

In [None]:
# Observe the distribution of the agent column, which has about 14% NaN values.
# histplot uses stat="count" by default, so the y axis shows counts rather than a density.
# NOTE(review): 'agent' looks like an identifier rather than a fee, hence the x label - confirm.

sns.histplot(dataset['agent'], kde=True, bins=50, linewidth=3, color="black")

plt.xlabel('Agent ID')
plt.ylabel('Count')

plt.show()

In [None]:
# The agent variable has approximately 14% missing values. They are filled with 0.
# NOTE(review): an earlier note described this as imputing the most frequent value, but the
# code fills with the constant 0 (no mode is computed) - confirm which behaviour is intended.

dataset['agent']=dataset['agent'].replace(np.nan, 0)

In [None]:
# Observe the agent distribution again after the imputation to see how it changed.
# histplot uses stat="count" by default, so the y axis shows counts rather than a density.
# NOTE(review): 'agent' looks like an identifier rather than a fee, hence the x label - confirm.

sns.histplot(dataset['agent'], kde=True, bins=50, linewidth=3, color="black")

plt.xlabel('Agent ID')
plt.ylabel('Count')

plt.show()

In [None]:
# Share of bookings per country, in percent, shown as a bar chart.

country_distribution = dataset['country'].value_counts(normalize=True) * 100

plt.figure(figsize=(30, 6))
country_distribution.plot(kind='bar', color='skyblue')

# Titles and axis labels.
plt.title('Country Distribution')
plt.xlabel('Country')
plt.ylabel('Percentage')

# Country codes are rotated so that they stay readable on the wide axis.
plt.xticks(
    rotation=90,
    ha='right',
    fontsize=12,
    color='black',
    backgroundcolor='white',
)
plt.tight_layout()

plt.show()

In [None]:
# Portugal (PRT) is by far the most frequent country, and only about 0.4% of the
# country values are missing, so the gaps are filled with the most frequent value.

most_frequent_country = dataset['country'].mode()[0]
dataset['country'] = dataset['country'].fillna(most_frequent_country)

In [None]:
# A negligible share of rows (about 0.000034) still has NaN in the "children" column;
# every row that still contains a NaN is removed.

dataset = dataset.dropna()

In [None]:
# Sanity check: no missing values may remain after the cleaning steps above.
assert not dataset.isnull().values.any(), "There are still NaN or null values in the dataset."

#### Merge and remove unnecessary columns

In [None]:
# Simplifying assumptions: adults, children and babies of one booking are treated as a single
# group size, and week nights / weekend nights as a single stay length, because the individual
# columns are assumed to have the same influence in the experiment.

guest_columns = ['adults', 'babies', 'children']
night_columns = ['stays_in_week_nights', 'stays_in_weekend_nights']

dataset['total_guests'] = dataset['adults'] + dataset['babies'] + dataset['children']
dataset.drop(columns=guest_columns, inplace=True)

dataset['total_days'] = dataset['stays_in_week_nights'] + dataset['stays_in_weekend_nights']
dataset.drop(columns=night_columns, inplace=True)

In [None]:
# Merge reserved_room_type and assigned_room_type into one flag that is 1 when the customer was
# assigned a room type different from the reserved one, and 0 when both match.
# The previous version set the flag to 1 when the room types were EQUAL, which contradicted the
# column name and inverted the meaning of the treatment used in the causal model.

dataset["different_room_assigned"] = np.where(dataset["reserved_room_type"] != dataset["assigned_room_type"], 1, 0)
dataset.drop(['assigned_room_type', 'reserved_room_type'], axis=1, inplace=True)


In [None]:
# Remove additional variables that are believed to have no strong causal effect in the
# experiment and would only complicate the causal relations.

columns_to_drop = [
    'reservation_status_date',
    'arrival_date_year',
    'arrival_date_week_number',
    'arrival_date_day_of_month',
    'market_segment',
    'customer_type',
    'meal',
    'reservation_status',
]
dataset.drop(columns=columns_to_drop, inplace=True)

#### Transform the arrival date month from categorical to numerical

In [None]:
from sklearn.preprocessing import LabelEncoder

# Month names in calendar order; a month's number is its position in this list plus one.
month_names = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
month_mapping = {name: number for number, name in enumerate(month_names, start=1)}

dataset['arrival_date_month'] = dataset['arrival_date_month'].map(month_mapping)

# The remaining text columns are replaced by integer codes; the same encoder
# object is refitted for every column.
le = LabelEncoder()

categorical_columns = ['hotel', 'deposit_type', 'distribution_channel', 'country']

for col in categorical_columns:
    encoded_values = le.fit_transform(dataset[col])
    dataset[col] = encoded_values

print(dataset)

#### Detect outliers

In [None]:
# Detect outliers: one box plot per column, arranged in a grid and saved as a PNG.

features = dataset.columns.to_list()

# Five plots per row; at least four rows (the previous fixed layout), more when the
# number of columns exceeds 20 so that plt.subplot never receives an out-of-range index.
n_cols = 5
n_rows = max(4, (len(features) + n_cols - 1) // n_cols)

plt.figure(figsize=(10,10))
for n, feature in enumerate(features, start=1):
    plt.subplot(n_rows, n_cols, n)
    sns.boxplot(dataset[feature])
# The layout only needs to be computed once, after all subplots exist.
plt.tight_layout()
plt.savefig("boxplot_outlier_detection.png")

In [None]:
# Keep only bookings with a lead time of at most 600 and an adr of at most 3000.
within_lead_time = dataset['lead_time'] <= 600
within_adr = dataset['adr'] <= 3000
dataset = dataset[within_lead_time & within_adr]

#### Save preprocessed dataset

In [None]:
# Saving the preprocessed dataset without the index column; the file is read back
# in the "Framing, Identification" part of the notebook.
dataset.to_csv("dataset_ready.csv", index=False)

In [None]:
# Number of bookings per value of the different_room_assigned flag.
dataset["different_room_assigned"].value_counts()

### EDA

#### Correlation matrix

In [None]:
# Correlation map between the numerical columns.
# A correlation between is_canceled and different_room_assigned can be observed, indicating that
# assigning a different room type may have an impact on booking cancellations.

# Columns are split by dtype: everything that is not stored as 'object' counts as numerical.
numerical_features = [col for col in dataset.columns if dataset[col].dtype != 'object']
categorical_features = [col for col in dataset.columns if dataset[col].dtype == 'object']

print(categorical_features)

plt.figure(figsize=(15,10))
correlations = dataset[numerical_features].corr()
sns.heatmap(correlations, linewidths=2, linecolor='black', annot=True, fmt=".3f")
plt.savefig("correlation_matrix.png")

#### Pairplot

In [None]:
# Scatter plots for every pair of numerical columns (numerical_features comes from the
# correlation-matrix cell); the figure is saved as a PNG and then displayed.

pairplot_fig = sns.pairplot(dataset[numerical_features])
pairplot_fig.savefig("pairplot.png")
plt.show()

#### Booking cancellation ratio

In [None]:
# Booking cancellation ratio as a pie chart with percentage labels.
# The colors are applied in the order returned by value_counts (most frequent value first).

dataset['is_canceled'].value_counts().plot.pie(autopct='%1.1f%%',shadow=True, colors=['green', 'red'])

#### Effect of lead time on cancellation rate

In [None]:
# Two-color palette that the count plots further down reuse.
custom_palette = ['#FF9999', '#66B2FF']

# Lead-time distribution for each value of is_canceled, saved as a PNG.
sns.boxplot(x='is_canceled', y='lead_time',data=dataset, palette=custom_palette)
plt.savefig("LeadTimeEffectOnCancellationRate.png")

#### Cancellation rate for repeated guests vs non repeated guests

In [None]:
def annotate_bars(ax, data, category_counts):
    """Label every bar with its share of the x category it belongs to.

    Parameters:
        ax: axes holding the bars (``ax.patches``) of a count plot.
        data: the plotted feature; kept for interface compatibility, not read.
        category_counts: total number of rows per x category, in the order the
            categories appear on the x axis (index 0 = first tick).

    The category of a bar is taken from the rounded x position of its centre.
    This relies on the categories sitting at integer positions with the hue
    bars grouped around them. Bars with no height, or whose category has no
    count, are skipped.
    """
    for p in ax.patches:
        height = p.get_height()
        # Skips empty bars and NaN heights (NaN > 0 is False).
        if not height > 0:
            continue
        x = p.get_x() + p.get_width() / 2
        category_index = int(round(float(x)))
        if not 0 <= category_index < len(category_counts):
            continue
        count = category_counts[category_index]
        if count <= 0:
            continue
        percentage = f'{100 * height / count:.1f}%'
        y = height
        ax.annotate(percentage, (x, y), ha='center', va='bottom', fontsize=12)

def display_graph(ax, feat):
    """Annotate the bars of ``ax`` with percentages relative to each ``feat`` category.

    The per-category totals are aligned with the x tick labels of ``ax`` so
    that every bar is divided by the size of its own category. Previously the
    totals were passed in frequency order and zipped against the bars, which
    paired bars with the wrong totals and annotated only the first bars.
    """
    counts = feat.value_counts()
    counts_by_label = {str(value): int(total) for value, total in counts.items()}
    tick_labels = [tick.get_text() for tick in ax.get_xticklabels()]
    if tick_labels and all(label in counts_by_label for label in tick_labels):
        category_counts = [counts_by_label[label] for label in tick_labels]
    else:
        # Fallback when the tick labels cannot be matched to the values.
        # NOTE(review): assumes the axis shows the categories in sorted order - confirm.
        category_counts = counts.sort_index().tolist()
    annotate_bars(ax, feat, category_counts)

# Count plot of repeated vs non-repeated guests, split by is_canceled,
# annotated through display_graph, saved as a PNG and then displayed.
plt.figure(figsize=(10, 5))
ax = sns.countplot(x='is_repeated_guest', hue='is_canceled', data=dataset, palette=custom_palette)
plt.title("Repeated Guest vs Cancellation", fontweight="bold", size=20)
display_graph(ax, dataset.is_repeated_guest)
plt.savefig("RepeatedGuestVsCancellation.png")
plt.show()

#### Cancellation rate for Deposit vs Non-Deposit

In [None]:
# Count plot of deposit types split by is_canceled. It reads from dataset_not_processed,
# the copy taken before preprocessing, where deposit_type has not been label-encoded.
plt.figure(figsize=(10, 5))
ax = sns.countplot(x='deposit_type', hue="is_canceled", data=dataset_not_processed, palette=custom_palette)
plt.title("Cancellation Count vs Deposit Type", fontweight="bold", size=20)
display_graph(ax, dataset_not_processed.deposit_type)
plt.savefig("DepositTypeVsCancellation.png")

#### Effect of the number of special requests on the cancellation rate

In [None]:
# Count plot of the number of special requests split by is_canceled,
# annotated through display_graph and saved as a PNG.
plt.figure(figsize=(10, 5))
ax = sns.countplot(x='total_of_special_requests', hue="is_canceled", data=dataset, palette=custom_palette)
plt.title("Cancellation count vs number of special requests", fontweight="bold", size=20)
display_graph(ax, dataset.total_of_special_requests)
plt.savefig("SpecialRequestsVsCancellation.png")

## Framing, Identification

#### Load the preprocessed dataset

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Load the ready dataset written by the preprocessing part (dataset_ready.csv),
# so this part of the notebook can run without repeating the preprocessing.
dataset=pd.read_csv('dataset_ready.csv')

In [None]:
# Every column of the preprocessed dataset is a measured variable of the experiment.
measured_variables = list(dataset.columns)
print(measured_variables)
variable_count = len(measured_variables)
print(f'number of measured variables: {variable_count}')

### Build the Causal Graph representing our experiment

In [None]:
from causalgraphicalmodels import CausalGraphicalModel

# Unobserved variables that are assumed to influence the measured ones.
hidden_confounders = ['financial_status_hidden', 'hotel_policies_hidden', 'local_events_hidden']
all_variables = measured_variables + hidden_confounders

# Each pair is a directed edge (cause, effect).
# The edge ('is_repeated_guest', 'is_canceled') used to be listed twice; it is kept once.
graph = CausalGraphicalModel(
    nodes=all_variables,
    edges=[
        ('distribution_channel', 'lead_time'),
        ('financial_status_hidden', 'distribution_channel'),
        ('distribution_channel', 'is_canceled'),
        ('distribution_channel', 'days_in_waiting_list'),
        ('distribution_channel', 'different_room_assigned'),
        ('hotel', 'booking_changes'),
        ('hotel', 'total_of_special_requests'),
        ('hotel', 'local_events_hidden'),
        ('hotel', 'total_days'),
        ('hotel', 'arrival_date_month'),
        ('lead_time', 'is_canceled'),
        ('country', 'lead_time'),
        ('lead_time', 'days_in_waiting_list'),
        ('lead_time', 'deposit_type'),
        ('local_events_hidden', 'lead_time'),
        ('local_events_hidden', 'booking_changes'),
        ('local_events_hidden', 'is_canceled'),
        ('local_events_hidden', 'deposit_type'),
        ('local_events_hidden', 'total_days'),
        ('country', 'local_events_hidden'),
        ('country', 'hotel_policies_hidden'),
        ('different_room_assigned', 'is_canceled'),
        ('total_of_special_requests', 'different_room_assigned'),
        ('hotel_policies_hidden', 'different_room_assigned'),
        ('total_guests', 'different_room_assigned'),
        ('deposit_type', 'different_room_assigned'),
        ('is_repeated_guest', 'is_canceled'),
        ('days_in_waiting_list', 'is_canceled'),
        ('hotel_policies_hidden', 'days_in_waiting_list'),
        ('previous_bookings_not_canceled', 'is_canceled'),
        ('previous_bookings_not_canceled', 'is_repeated_guest'),
        ('financial_status_hidden', 'deposit_type'),
        ('financial_status_hidden', 'is_canceled'),
        ('arrival_date_month', 'total_days'),
        ('total_days', 'is_canceled'),
        ('total_days', 'agent'),
        ('agent', 'is_canceled'),
        ('financial_status_hidden', 'agent'),
        ('country', 'agent'),
        ('agent', 'days_in_waiting_list'),
        ('total_guests', 'is_canceled'),
        ('previous_cancellations', 'is_canceled'),
        ('previous_cancellations', 'is_repeated_guest'),
        ('total_guests', 'required_car_parking_spaces'),
        ('total_days', 'required_car_parking_spaces'),
        ('total_of_special_requests', 'is_canceled'),
        ('booking_changes', 'different_room_assigned'),
        ('booking_changes', 'is_canceled'),
        ('is_canceled', 'adr'),
        ('total_of_special_requests', 'days_in_waiting_list'),
    ]
)

graph.draw()

#### Ensure graph is DAG

In [None]:
# Ensure Causal Graph is DAG

from collections import defaultdict

class Graph:
    """Directed graph with depth-first cycle detection.

    The graph is built from an iterable of ``(source, target)`` pairs and is
    stored as an adjacency list in ``self.graph``.
    """

    def __init__(self, edges):
        self.graph = defaultdict(list)
        # Set by _dfs when a cycle is found; None until then.
        self.cycle = None
        for u, v in edges:
            self.graph[u].append(v)

    def find_cycle(self):
        """Return the nodes of one cycle in traversal order, or None if there is none.

        The last node of the returned list has an edge back to the first one.
        Only the nodes that are part of the cycle are reported; the ancestors
        that merely lead to it are left out.
        """
        self.cycle = None
        visited = set()
        # A dict keeps insertion order, so it serves as an ordered recursion stack
        # with constant-time membership tests.
        rec_stack = {}

        for node in list(self.graph):
            if node not in visited and self._dfs(node, visited, rec_stack):
                return self.cycle

        return None

    def _dfs(self, node, visited, rec_stack):
        """Depth-first visit of ``node``; return True as soon as a cycle is found."""
        visited.add(node)
        rec_stack[node] = None

        # .get avoids inserting empty entries for nodes without outgoing edges.
        for neighbor in self.graph.get(node, ()):
            if neighbor not in visited:
                if self._dfs(neighbor, visited, rec_stack):
                    return True
            elif neighbor in rec_stack:
                # Back edge: the cycle is the part of the current path that starts at neighbor.
                path = list(rec_stack)
                self.cycle = path[path.index(neighbor):]
                return True

        del rec_stack[node]
        return False

# Edge list of the causal graph as (cause, effect) pairs.
# The edge ('is_repeated_guest', 'is_canceled') used to be listed twice; it is kept once.
edges = [
    ('distribution_channel', 'lead_time'),
    ('financial_status_hidden', 'distribution_channel'),
    ('distribution_channel', 'is_canceled'),
    ('distribution_channel', 'days_in_waiting_list'),
    ('distribution_channel', 'different_room_assigned'),
    ('hotel', 'booking_changes'),
    ('hotel', 'total_of_special_requests'),
    ('hotel', 'local_events_hidden'),
    ('hotel', 'total_days'),
    ('hotel', 'arrival_date_month'),
    ('lead_time', 'is_canceled'),
    ('country', 'lead_time'),
    ('lead_time', 'days_in_waiting_list'),
    ('lead_time', 'deposit_type'),
    ('local_events_hidden', 'lead_time'),
    ('local_events_hidden', 'booking_changes'),
    ('local_events_hidden', 'is_canceled'),
    ('local_events_hidden', 'deposit_type'),
    ('local_events_hidden', 'total_days'),
    ('country', 'local_events_hidden'),
    ('country', 'hotel_policies_hidden'),
    ('different_room_assigned', 'is_canceled'),
    ('total_of_special_requests', 'different_room_assigned'),
    ('hotel_policies_hidden', 'different_room_assigned'),
    ('total_guests', 'different_room_assigned'),
    ('deposit_type', 'different_room_assigned'),
    ('is_repeated_guest', 'is_canceled'),
    ('days_in_waiting_list', 'is_canceled'),
    ('hotel_policies_hidden', 'days_in_waiting_list'),
    ('previous_bookings_not_canceled', 'is_canceled'),
    ('previous_bookings_not_canceled', 'is_repeated_guest'),
    ('financial_status_hidden', 'deposit_type'),
    ('financial_status_hidden', 'is_canceled'),
    ('arrival_date_month', 'total_days'),
    ('total_days', 'is_canceled'),
    ('total_days', 'agent'),
    ('agent', 'is_canceled'),
    ('financial_status_hidden', 'agent'),
    ('country', 'agent'),
    ('agent', 'days_in_waiting_list'),
    ('total_guests', 'is_canceled'),
    ('previous_cancellations', 'is_canceled'),
    ('previous_cancellations', 'is_repeated_guest'),
    ('total_guests', 'required_car_parking_spaces'),
    ('total_days', 'required_car_parking_spaces'),
    ('total_of_special_requests', 'is_canceled'),
    ('booking_changes', 'different_room_assigned'),
    ('booking_changes', 'is_canceled'),
    ('is_canceled', 'adr'),
    ('total_of_special_requests', 'days_in_waiting_list'),
]

# NOTE: this rebinds the name `graph`, which held the CausalGraphicalModel before this cell.
graph = Graph(edges)
cycle = graph.find_cycle()

if cycle:
    print("Cycle detected:", cycle)
else:
    print("No cycle detected")

### Framing

In [None]:
import dowhy
from dowhy import CausalModel
import json


# Causal graph in DOT format, one "cause" -> "effect" edge per line.
# The edge "is_repeated_guest" -> "is_canceled" used to be listed twice; it is kept once,
# and trailing blanks inside the string were removed.
graph_str = """
digraph {
    "distribution_channel" -> "lead_time";
    "financial_status_hidden" -> "distribution_channel";
    "distribution_channel" -> "is_canceled";
    "distribution_channel" -> "days_in_waiting_list";
    "distribution_channel" -> "different_room_assigned";
    "hotel" -> "booking_changes";
    "hotel" -> "total_of_special_requests";
    "hotel" -> "local_events_hidden";
    "hotel" -> "total_days";
    "hotel" -> "arrival_date_month";
    "lead_time" -> "is_canceled";
    "country" -> "lead_time";
    "lead_time" -> "days_in_waiting_list";
    "lead_time" -> "deposit_type";
    "local_events_hidden" -> "lead_time";
    "local_events_hidden" -> "booking_changes";
    "local_events_hidden" -> "is_canceled";
    "local_events_hidden" -> "deposit_type";
    "local_events_hidden" -> "total_days";
    "country" -> "local_events_hidden";
    "country" -> "hotel_policies_hidden";
    "different_room_assigned" -> "is_canceled";
    "total_of_special_requests" -> "different_room_assigned";
    "hotel_policies_hidden" -> "different_room_assigned";
    "total_guests" -> "different_room_assigned";
    "deposit_type" -> "different_room_assigned";
    "is_repeated_guest" -> "is_canceled";
    "days_in_waiting_list" -> "is_canceled";
    "hotel_policies_hidden" -> "days_in_waiting_list";
    "previous_bookings_not_canceled" -> "is_canceled";
    "previous_bookings_not_canceled" -> "is_repeated_guest";
    "financial_status_hidden" -> "deposit_type";
    "financial_status_hidden" -> "is_canceled";
    "arrival_date_month" -> "total_days";
    "total_days" -> "is_canceled";
    "total_days" -> "agent";
    "agent" -> "is_canceled";
    "financial_status_hidden" -> "agent";
    "country" -> "agent";
    "agent" -> "days_in_waiting_list";
    "total_guests" -> "is_canceled";
    "previous_cancellations" -> "is_canceled";
    "previous_cancellations" -> "is_repeated_guest";
    "total_guests" -> "required_car_parking_spaces";
    "total_days" -> "required_car_parking_spaces";
    "total_of_special_requests" -> "is_canceled";
    "booking_changes" -> "different_room_assigned";
    "booking_changes" -> "is_canceled";
    "is_canceled" -> "adr";
    "total_of_special_requests" -> "days_in_waiting_list";
}
"""

# The model works on a copy, so the loaded dataset itself stays untouched.
# Treatment: different_room_assigned; outcome: is_canceled.
dataset_modeling=dataset.copy()
model = CausalModel(
    data=dataset_modeling,
    treatment='different_room_assigned',
    outcome='is_canceled',
    graph=graph_str
)

### Identification

In [None]:
# Identification: derive the estimand from the causal model, then print it together with
# the identified back-door variable sets (formatted as indented JSON).
identified_estimand = model.identify_effect()
print(f"Identified effects: {identified_estimand}")
print(f"All identified back-door variables: {json.dumps(identified_estimand.backdoor_variables, indent=4)}")

## Estimation, Validation

### Estimate ATE and Validate

#### Choosing backdoor criterion based on the identification and ate effect

In [None]:
# Choosing backdoor criterion based on the identification
from pathlib import Path

# Use the back-door set registered under the key 'backdoor' and request the
# average treatment effect (ATE) in the estimation cells that follow.
identified_estimand.default_backdoor_id = 'backdoor'
desired_effect="ate"

#### Estimate ATE using the relevant group of variables identified through the backdoor paths

##### Estimate ATE using Linear model, propensity score weighting, propensity score matching, propensity score stratification

In [None]:
import contextlib
import io
import sys
import time
import warnings
from pathlib import Path

# Set to True to skip this cell.
estimated_regular_ate = False

if not estimated_regular_ate:
    warnings.filterwarnings("ignore", category=FutureWarning)
    # All four methods named in the section heading are evaluated; the leftover debugging
    # override that restricted the run to linear regression (marked "# DELETE") was removed.
    methods = [
        'backdoor.linear_regression',
        'backdoor.propensity_score_weighting',
        'backdoor.propensity_score_matching',
        'backdoor.propensity_score_stratification',
    ]
    separator = '\n\n\n -----------------------------------------------------------------------\n'
    # NOTE(review): machine-specific absolute path - adjust before running elsewhere.
    save_path = Path("/Users/shadi-omari/Repos/CausalInference/Results/ATE/Regular")

    for method in methods:
        start_time = time.time()

        try:
            estimate = model.estimate_effect(
                identified_estimand,
                method_name=method,
                target_units=desired_effect,
                confidence_intervals=True,
                test_significance=True,
            )

            # Capture the printed interpretation for the report file. redirect_stdout restores
            # sys.stdout even when interpret() raises, so later output is never lost.
            with contextlib.redirect_stdout(io.StringIO()) as buffer:
                estimate.interpret()
            interpretation = buffer.getvalue()

            refute_placebo_treatment = model.refute_estimate(
                identified_estimand,
                estimate,
                method_name="placebo_treatment_refuter",
                placebo_type="permute")

            # The measured time covers estimation, interpretation and refutation.
            elapsed_time = time.time() - start_time
            minutes, seconds = divmod(elapsed_time, 60)

            save_path.mkdir(parents=True, exist_ok=True)
            file_path = save_path / f"{method}_ATE.txt"
            with open(file_path, "w") as file:
                file.write(str(estimate))
                file.write(separator)
                file.write(interpretation)
                file.write(separator)
                file.write('refute_placebo_treatment: ' + str(refute_placebo_treatment))
                file.write(separator)
                file.write(f'Time taken to estimate: {int(minutes)} minutes and {seconds:.2f} seconds\n')

            print(separator)
            print(f'Calculating ATE using {method}: {estimate.value}')
            print(separator)
            estimate.interpret()
            print(separator)
            print('refute_placebo_treatment:' + str(refute_placebo_treatment))
            print(separator)
            print(f'Time taken to estimate: {int(minutes)} minutes and {seconds:.2f} seconds\n')

        except AttributeError as e:
            print(f"Error encountered with {method}: {e}")

##### Estimate ATE using S-Learner, by fitting {LinearRegression, Ridge, Lasso, ElasticNet, DecisionTreeRegressor, RandomForestRegressor, LogisticRegression, MLPRegressor, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, DecisionTreeClassifier} to predict the outcome 

In [None]:
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
import contextlib
import io
import sys
import time
import warnings
from pathlib import Path

# Set to True to skip this cell.
estimated_Slearner_ate = False

if not estimated_Slearner_ate:
    warnings.filterwarnings(action='ignore', category=UserWarning)

    # One S-learner is fitted per model class; each class is instantiated with its defaults.
    ml_models=[LinearRegression, Ridge, DecisionTreeRegressor, LogisticRegression, MLPRegressor, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, DecisionTreeClassifier]
    separator = '\n\n\n -----------------------------------------------------------------------\n'
    # NOTE(review): machine-specific absolute path - adjust before running elsewhere.
    save_path = Path("/Users/shadi-omari/Repos/CausalInference/Results/ATE/SLearner")

    for ml_model in ml_models:
        start_time = time.time()

        try:
            estimate = model.estimate_effect(
                identified_estimand,
                method_name="backdoor.econml.metalearners.SLearner",
                method_params={
                    "init_params": {'overall_model': ml_model()},
                    "fit_params": {}
                },
                target_units=desired_effect,
                test_significance=True,
            )

            # Capture the printed interpretation for the report file. redirect_stdout restores
            # sys.stdout even when interpret() raises, so later output is never lost.
            with contextlib.redirect_stdout(io.StringIO()) as buffer:
                estimate.interpret()
            interpretation = buffer.getvalue()

            refute_placebo_treatment = model.refute_estimate(
                identified_estimand,
                estimate,
                method_name="placebo_treatment_refuter",
                placebo_type="permute")

            # The measured time covers estimation, interpretation and refutation.
            elapsed_time = time.time() - start_time
            minutes, seconds = divmod(elapsed_time, 60)

            save_path.mkdir(parents=True, exist_ok=True)
            file_path = save_path / f"{ml_model.__name__}_ATE.txt"
            with open(file_path, "w") as file:
                file.write(str(estimate))
                file.write(separator)
                file.write(interpretation)
                file.write(separator)
                file.write('refute_placebo_treatment: ' + str(refute_placebo_treatment))
                file.write(separator)
                file.write(f'Time taken to estimate: {int(minutes)} minutes and {seconds:.2f} seconds\n')

            print(separator)
            print(f'Calculating ATE using {ml_model.__name__}: {estimate.value}')
            print(separator)
            estimate.interpret()
            print(separator)
            print('refute_placebo_treatment:' + str(refute_placebo_treatment))
            print(separator)
            print(f'Time taken to estimate: {int(minutes)} minutes and {seconds:.2f} seconds\n')

        except AttributeError as e:
            print(f"Error encountered with {ml_model.__name__}: {e}")

##### Estimate ATE using T-Learner, by fitting {LinearRegression, Ridge, Lasso, ElasticNet, DecisionTreeRegressor, RandomForestRegressor, LogisticRegression, MLPRegressor, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, DecisionTreeClassifier} to predict the outcome for both models f_0 and f_1

In [None]:
import contextlib
import io
import time
import warnings
from pathlib import Path

# Set to True to skip this cell.
estimated_Tlearner_ate = False

if not estimated_Tlearner_ate:
    warnings.filterwarnings(action='ignore', category=UserWarning)

    ml_models=[LinearRegression, Ridge, DecisionTreeRegressor, LogisticRegression, MLPRegressor, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, DecisionTreeClassifier]
    separator = '\n\n\n -----------------------------------------------------------------------\n'
    # NOTE(review): machine-specific absolute path - adjust before running elsewhere.
    save_path = Path("/Users/shadi-omari/Repos/CausalInference/Results/ATE/TLearner")

    # Estimate the effect using a custom econml model: two separate instances of the
    # same model class are handed to the T-learner.
    for ml_model in ml_models:
        start_time = time.time()

        try:
            estimate = model.estimate_effect(
                identified_estimand,
                method_name="backdoor.econml.metalearners.TLearner",
                method_params={
                    "init_params": {'models': [ml_model(), ml_model()]},
                    "fit_params": {}
                },
                target_units=desired_effect,
                test_significance=True,
            )

            # Capture the printed interpretation for the report file. redirect_stdout restores
            # sys.stdout even when interpret() raises, and this cell no longer depends on
            # `sys` and `io` having been imported by an earlier cell.
            with contextlib.redirect_stdout(io.StringIO()) as buffer:
                estimate.interpret()
            interpretation = buffer.getvalue()

            refute_placebo_treatment = model.refute_estimate(
                identified_estimand,
                estimate,
                method_name="placebo_treatment_refuter",
                placebo_type="permute")

            # The measured time covers estimation, interpretation and refutation.
            elapsed_time = time.time() - start_time
            minutes, seconds = divmod(elapsed_time, 60)

            save_path.mkdir(parents=True, exist_ok=True)
            file_path = save_path / f"{ml_model.__name__}_ATE.txt"
            with open(file_path, "w") as file:
                file.write(str(estimate))
                file.write(separator)
                file.write(interpretation)
                file.write(separator)
                file.write('refute_placebo_treatment: ' + str(refute_placebo_treatment))
                file.write(separator)
                file.write(f'Time taken to estimate: {int(minutes)} minutes and {seconds:.2f} seconds\n')

            print(separator)
            print(f'Calculating ATE using {ml_model.__name__}: {estimate.value}')
            print(separator)
            estimate.interpret()
            print(separator)
            print('refute_placebo_treatment:' + str(refute_placebo_treatment))
            print(separator)
            print(f'Time taken to estimate: {int(minutes)} minutes and {seconds:.2f} seconds\n')

        except AttributeError as e:
            print(f"Error encountered with {ml_model.__name__}: {e}")

### Estimate ATT and Validate

#### Choosing backdoor criterion based on the identification and att effect

In [None]:
# Choosing backdoor criterion based on the identification

# Keep the back-door set registered under the key 'backdoor' and switch the requested
# effect to "att" for the estimation cells that follow.
identified_estimand.default_backdoor_id = 'backdoor'
desired_effect="att"

#### Estimate ATT using the relevant group of variables identified through the backdoor paths

##### Estimate ATT using Linear model, propensity score weighting, propensity score matching, propensity score stratification

In [None]:
import contextlib
import io
import sys
import time
import warnings
from pathlib import Path

# Set to True to skip this cell.
estimated_regular_att = False

if not estimated_regular_att:
    warnings.filterwarnings("ignore", category=FutureWarning)
    methods = [
        'backdoor.linear_regression',
        'backdoor.propensity_score_weighting',
        'backdoor.propensity_score_matching',
        'backdoor.propensity_score_stratification',
    ]
    separator = '\n\n\n -----------------------------------------------------------------------\n'
    # NOTE(review): machine-specific absolute path - adjust before running elsewhere.
    save_path = Path("/Users/shadi-omari/Repos/CausalInference/Results/ATT/Regular")

    for method in methods:
        start_time = time.time()

        try:
            estimate = model.estimate_effect(
                identified_estimand,
                method_name=method,
                target_units=desired_effect,
                confidence_intervals=True,
                test_significance=True,
            )

            # Capture the printed interpretation for the report file. redirect_stdout restores
            # sys.stdout even when interpret() raises, so later output is never lost.
            with contextlib.redirect_stdout(io.StringIO()) as buffer:
                estimate.interpret()
            interpretation = buffer.getvalue()

            refute_placebo_treatment = model.refute_estimate(
                identified_estimand,
                estimate,
                method_name="placebo_treatment_refuter",
                placebo_type="permute")

            # The measured time covers estimation, interpretation and refutation.
            elapsed_time = time.time() - start_time
            minutes, seconds = divmod(elapsed_time, 60)

            save_path.mkdir(parents=True, exist_ok=True)
            file_path = save_path / f"{method}_ATT.txt"
            with open(file_path, "w") as file:
                file.write(str(estimate))
                file.write(separator)
                file.write(interpretation)
                file.write(separator)
                file.write('refute_placebo_treatment: ' + str(refute_placebo_treatment))
                file.write(separator)
                file.write(f'Time taken to estimate: {int(minutes)} minutes and {seconds:.2f} seconds\n')

            print(separator)
            print(f'Calculating ATT using {method}: {estimate.value}')
            print(separator)
            estimate.interpret()
            print(separator)
            print('refute_placebo_treatment:' + str(refute_placebo_treatment))
            print(separator)
            print(f'Time taken to estimate: {int(minutes)} minutes and {seconds:.2f} seconds\n')

        except AttributeError as e:
            print(f"Error encountered with {method}: {e}")

##### Estimate ATT using S-Learner, by fitting {LinearRegression, Ridge, Lasso, ElasticNet, DecisionTreeRegressor, RandomForestRegressor, LogisticRegression, MLPRegressor, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, DecisionTreeClassifier} to predict the outcome 

In [None]:
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
import contextlib
import io
import sys
import time
import warnings

# Estimate the ATT with EconML's S-Learner through DoWhy, once per candidate
# outcome model. For every model the cell captures the textual interpretation,
# runs a placebo-treatment refutation, writes a report file and echoes the same
# information to the notebook output.

# Flip to True to skip the whole estimation loop below.
estimated_Slearner_att = False

if not estimated_Slearner_att:
    warnings.filterwarnings(action='ignore', category=UserWarning)

    # Visual divider used between the sections of both the report file and the printed output.
    separator = '\n\n\n -----------------------------------------------------------------------\n'

    ml_models = [
        LinearRegression,
        Ridge,
        DecisionTreeRegressor,
        LogisticRegression,
        MLPRegressor,
        RandomForestClassifier,
        GradientBoostingClassifier,
        AdaBoostClassifier,
        DecisionTreeClassifier,
    ]

    for ml_model in ml_models:
        start_time = time.time()

        try:
            estimate = model.estimate_effect(
                identified_estimand,
                method_name="backdoor.econml.metalearners.SLearner",
                method_params={
                    # A fresh, default-configured estimator instance per run.
                    "init_params": {'overall_model': ml_model()},
                    "fit_params": {}
                },
                target_units=desired_effect,
                test_significance=True,
            )

            # interpret() prints instead of returning text, so capture its output.
            # redirect_stdout restores sys.stdout even when interpret() raises;
            # otherwise the except-branch message below (and every later print in
            # the notebook) would be swallowed by the buffer.
            buffer = io.StringIO()
            with contextlib.redirect_stdout(buffer):
                estimate.interpret()
            interpretation = buffer.getvalue()

            refute_placebo_treatment = model.refute_estimate(
                identified_estimand,
                estimate,
                method_name="placebo_treatment_refuter",
                placebo_type="permute")

            # The reported time covers estimation, interpretation and refutation.
            end_time = time.time()
            elapsed_time = end_time - start_time
            minutes, seconds = divmod(elapsed_time, 60)

            save_path = Path("/Users/shadi-omari/Repos/CausalInference/Results/ATT/SLearner")
            save_path.mkdir(parents=True, exist_ok=True)
            file_path = save_path / f"{ml_model.__name__}_ATT.txt"
            # Explicit encoding so the report does not depend on the platform default.
            with open(file_path, "w", encoding="utf-8") as file:
                file.write(str(estimate))
                file.write(separator)
                file.write(interpretation)
                file.write(separator)
                file.write('refute_placebo_treatment: ' + str(refute_placebo_treatment))
                file.write(separator)
                file.write(f'Time taken to estimate: {int(minutes)} minutes and {seconds:.2f} seconds\n')

            print(separator)
            print(f'Calculating ATT using {ml_model.__name__}: {estimate.value}')
            print(separator)
            estimate.interpret()
            print(separator)
            print('refute_placebo_treatment:' + str(refute_placebo_treatment))
            print(separator)
            print(f'Time taken to estimate: {int(minutes)} minutes and {seconds:.2f} seconds\n')

        except AttributeError as e:
            # Report the failing model and carry on with the next one.
            print(f"Error encountered with {ml_model.__name__}: {e}")

##### Estimate ATT using T-Learner, by fitting each of {LinearRegression, Ridge, DecisionTreeRegressor, LogisticRegression, MLPRegressor, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, DecisionTreeClassifier} to predict the outcome for both models f_0 and f_1

In [None]:
import contextlib
import io
import time
import warnings

# Estimate the ATT with EconML's T-Learner through DoWhy, once per candidate
# outcome model. For every model the cell captures the textual interpretation,
# runs a placebo-treatment refutation, writes a report file and echoes the same
# information to the notebook output.

# Flip to True to skip the whole estimation loop below.
estimated_Tlearner_att = False

if not estimated_Tlearner_att:
    warnings.filterwarnings(action='ignore', category=UserWarning)

    # Visual divider used between the sections of both the report file and the printed output.
    separator = '\n\n\n -----------------------------------------------------------------------\n'

    ml_models = [
        LinearRegression,
        Ridge,
        DecisionTreeRegressor,
        LogisticRegression,
        MLPRegressor,
        RandomForestClassifier,
        GradientBoostingClassifier,
        AdaBoostClassifier,
        DecisionTreeClassifier,
    ]

    # Estimate the effect using a custom econml model
    for ml_model in ml_models:
        start_time = time.time()

        try:
            estimate = model.estimate_effect(
                identified_estimand,
                method_name="backdoor.econml.metalearners.TLearner",
                method_params={
                    # Two separate default-configured instances of the same estimator class.
                    "init_params": {'models': [ml_model(), ml_model()]},
                    "fit_params": {}
                },
                target_units=desired_effect,
                test_significance=True,
            )

            # interpret() prints instead of returning text, so capture its output.
            # redirect_stdout restores sys.stdout even when interpret() raises;
            # otherwise the except-branch message below (and every later print in
            # the notebook) would be swallowed by the buffer.
            buffer = io.StringIO()
            with contextlib.redirect_stdout(buffer):
                estimate.interpret()
            interpretation = buffer.getvalue()

            refute_placebo_treatment = model.refute_estimate(
                identified_estimand,
                estimate,
                method_name="placebo_treatment_refuter",
                placebo_type="permute")

            # The reported time covers estimation, interpretation and refutation.
            end_time = time.time()
            elapsed_time = end_time - start_time
            minutes, seconds = divmod(elapsed_time, 60)

            save_path = Path("/Users/shadi-omari/Repos/CausalInference/Results/ATT/TLearner")
            save_path.mkdir(parents=True, exist_ok=True)
            file_path = save_path / f"{ml_model.__name__}_ATT.txt"
            # Explicit encoding so the report does not depend on the platform default.
            with open(file_path, "w", encoding="utf-8") as file:
                file.write(str(estimate))
                file.write(separator)
                file.write(interpretation)
                file.write(separator)
                file.write('refute_placebo_treatment: ' + str(refute_placebo_treatment))
                file.write(separator)
                file.write(f'Time taken to estimate: {int(minutes)} minutes and {seconds:.2f} seconds\n')

            print(separator)
            print(f'Calculating ATT using {ml_model.__name__}: {estimate.value}')
            print(separator)
            estimate.interpret()
            print(separator)
            print('refute_placebo_treatment:' + str(refute_placebo_treatment))
            print(separator)
            print(f'Time taken to estimate: {int(minutes)} minutes and {seconds:.2f} seconds\n')

        except AttributeError as e:
            # Report the failing model and carry on with the next one.
            print(f"Error encountered with {ml_model.__name__}: {e}")