# Step 1: Data Preparation and Splitting


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Column that every later step treats as the prediction / causal target.
target_variable = 'unfulfilled_requests'

# Read the preprocessed dataset from disk.
data_path = '../data/preprocessed_data.csv'
df = pd.read_csv(data_path)

# Keep 20% of the rows aside as a hold-out set; the fixed seed makes the
# 80/20 partition reproducible across kernel restarts.
split_options = {'test_size': 0.2, 'random_state': 42}
train_df, holdout_df = train_test_split(df, **split_options)


# Step 2: Causal Graph Creation (Ground Truth)


In [4]:
from causalnex.plots import plot_structure
from causalnex.structure import StructureModel  # kept: a later cell instantiates StructureModel
from causalnex.structure.notears import from_pandas

# Learn the structure from data first (binary discretisation for simplicity:
# pd.qcut with q=2 maps each column to bin labels, normally 0 and 1).
discretised_data = train_df.apply(pd.qcut, q=2, labels=False, duplicates='drop')
sm = from_pandas(discretised_data, max_iter=1000)

# Overlay edges known from domain knowledge / statistical tests AFTER learning.
# Adding them to a StructureModel before calling from_pandas has no effect,
# because from_pandas returns a brand-new model that replaces it.
expert_edges = [
    ('drivers_movement', 'unfulfilled_requests'),
    ('order_accuracy', 'unfulfilled_requests'),
    # Add more edges as appropriate
]
for cause, effect in expert_edges:
    if cause in sm.nodes and effect in sm.nodes:
        sm.add_edge(cause, effect, origin='expert')
    else:
        # An edge to a column that is not in the data would create a node the
        # network can never be fitted on, so report it instead of adding it.
        print(f"Skipping expert edge {cause} -> {effect}: column not present in the data")

# NOTE(review): the learned graph is used unthresholded here, as before.
# Weak edges may need pruning before the graph is used for inference - confirm.

# Optionally, you can visualize the graph
plot_structure(sm)


ModuleNotFoundError: No module named 'causalnex'

# Step 3: Incremental Causal Graphs and Jaccard Similarity


In [None]:
from causalnex.structure.notears import from_pandas


def edge_jaccard(edges_a, edges_b):
    """Return the Jaccard similarity |A & B| / |A | B| of two edge sets.

    Two empty edge sets are treated as identical (similarity 1.0) so the
    comparison never divides by zero.
    """
    combined = edges_a | edges_b
    if not combined:
        return 1.0
    return len(edges_a & edges_b) / len(combined)


# The reference graph does not change inside the loop, so build its edge set once.
ground_truth_edges = set(sm.edges())

# Example: Train with increasing fractions of the data
fraction_sizes = [0.2, 0.4, 0.6, 0.8]  # Example fractions
jaccard_by_fraction = {}
for frac in fraction_sizes:
    sample_data = train_df.sample(frac=frac, random_state=42)
    # Use a loop-local name so the full-data `discretised_data` from the
    # ground-truth step is not silently overwritten.
    sample_discretised = sample_data.apply(pd.qcut, q=2, labels=False, duplicates='drop')
    incremental_sm = from_pandas(sample_discretised, max_iter=1000)

    # Compare the new graph with ground truth using Jaccard similarity
    incremental_edges = set(incremental_sm.edges())
    jaccard_similarity = edge_jaccard(ground_truth_edges, incremental_edges)
    jaccard_by_fraction[frac] = jaccard_similarity

    print(f"Jaccard Similarity with {frac} fraction: {jaccard_similarity}")


# Step 4: Select Variables Directly Affecting the Target Variable


In [6]:
from causalnex.inference import InferenceEngine
from causalnex.network import BayesianNetwork

# The direct causes of the target are its parents in the causal graph.
# StructureModel is a networkx DiGraph, so predecessors() lists them directly;
# no inference engine is needed for this step.
direct_causes = sorted(sm.predecessors(target_variable))

print("Direct causes of 'unfulfilled_requests':", direct_causes)

# InferenceEngine works on a fitted BayesianNetwork, not on a bare
# StructureModel. Re-discretise the training data here rather than reusing a
# variable from another cell, so this cell does not depend on hidden state.
bn_training_data = train_df.apply(pd.qcut, q=2, labels=False, duplicates='drop')

# BayesianNetwork raises ValueError unless the structure is a connected DAG.
# NOTE(review): if that happens, prune weak edges first (e.g.
# sm.remove_edges_below_threshold(...) and sm.get_largest_subgraph()) - the
# right threshold is a modelling decision, so it is not hard-coded here.
bn = BayesianNetwork(sm)
bn = bn.fit_node_states(bn_training_data)
# The Bayesian estimator with a K2 prior avoids undefined probabilities for
# parent-state combinations that never occur in the training data.
bn = bn.fit_cpds(bn_training_data, method="BayesianEstimator", bayes_prior="K2")

engine = InferenceEngine(bn)


ModuleNotFoundError: No module named 'causalnex'

# Step 5: Answering Questions using Do-Intervention


In [5]:
# Marginal distribution of every node before any intervention. It doubles as a
# lookup of which nodes and states the fitted network actually contains.
baseline_marginals = engine.query()
print(f"Baseline P({target_variable}):", baseline_marginals[target_variable])

# Interventions must be expressed in the state space the network was fitted
# on, not in raw units such as km or percentages.
# NOTE(review): assumes the binary qcut discretisation used when learning the
# graph (0 = lower bin, 1 = upper bin) - confirm and adjust the states below.
new_value = 1  # TODO: state representing the new time requirement for past orders

interventions = [
    # (description, node, state or {state: probability})
    ('Drivers move 1km every 30 mins', 'drivers_movement', 1),
    # A {state: probability} dict sets a distribution instead of a single state.
    # TODO confirm that bin 1 is the "known within 5km" bin.
    ('20% of orders known within 5km accuracy', 'order_accuracy', {0: 0.8, 1: 0.2}),
    ('Changed time requirements for past orders', 'order_completion_time', new_value),
    ('Drivers increased by 10% cumulative per month', 'number_of_drivers', 1),
]

# Evaluate the effect on the target after each intervention, one at a time.
intervention_effects = {}
for description, node, state in interventions:
    if node not in baseline_marginals:
        print(f"Skipping '{description}': '{node}' is not a node of the fitted network")
        continue

    requested_states = set(state) if isinstance(state, dict) else {state}
    unknown_states = requested_states - set(baseline_marginals[node])
    if unknown_states:
        print(f"Skipping '{description}': {sorted(unknown_states)} are not states of '{node}'")
        continue

    try:
        engine.do_intervention(node, state)
    except ValueError as err:
        # Raised, for example, when the intervention would isolate the node.
        print(f"Skipping '{description}': {err}")
        continue

    try:
        intervention_effects[description] = engine.query()[target_variable]
    finally:
        # Undo the intervention so the examples do not accumulate.
        engine.reset_do(node)

    print(f"P({target_variable} | do: {description}):", intervention_effects[description])


NameError: name 'engine' is not defined

# Step 6: Train Machine Learning Models


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

# Variables chosen from the causal graph, plus any extra columns added by hand.
# Only real feature columns are kept: a name that is not a column in train_df
# would raise a KeyError on selection, and the target must never be a feature.
extra_selected_variables = []  # add further column names here if needed
candidate_variables = list(direct_causes) + extra_selected_variables
selected_variables = [
    column
    for column in dict.fromkeys(candidate_variables)  # de-duplicate, keep order
    if column in train_df.columns and column != target_variable
]
if not selected_variables:
    raise ValueError(
        "No selected variables are present in the training data; "
        "check `direct_causes` and `extra_selected_variables`."
    )

# Feature matrices and targets for both feature sets.
X_train_all = train_df.drop(columns=[target_variable])
y_train_all = train_df[target_variable]
X_holdout_all = holdout_df.drop(columns=[target_variable])
y_holdout_all = holdout_df[target_variable]

X_train_selected = train_df[selected_variables]
y_train_selected = train_df[target_variable]
X_holdout_selected = holdout_df[selected_variables]
y_holdout_selected = holdout_df[target_variable]

# Train models with all variables
model_all = RandomForestRegressor(random_state=42)
model_all.fit(X_train_all, y_train_all)

# Train models with selected variables from causal graph
# NOTE(review): the two feature sets are fitted with different model families,
# so a difference in error reflects both the features and the algorithm.
# Confirm whether that is intended.
model_selected = XGBRegressor(random_state=42)
model_selected.fit(X_train_selected, y_train_selected)

# Evaluate models on the hold-out set to measure overfitting
y_pred_all = model_all.predict(X_holdout_all)
y_pred_selected = model_selected.predict(X_holdout_selected)

mse_all = mean_squared_error(y_holdout_all, y_pred_all)
mse_selected = mean_squared_error(y_holdout_selected, y_pred_selected)

# Overfitting shows up as the gap between training and hold-out error, so the
# training error is computed as well.
mse_train_all = mean_squared_error(y_train_all, model_all.predict(X_train_all))
mse_train_selected = mean_squared_error(
    y_train_selected, model_selected.predict(X_train_selected)
)

print(f"MSE (All variables): {mse_all}")
print(f"MSE (Selected variables): {mse_selected}")
print(f"Train MSE (All variables): {mse_train_all}")
print(f"Train MSE (Selected variables): {mse_train_selected}")
print(f"Hold-out minus train MSE (All variables): {mse_all - mse_train_all}")
print(f"Hold-out minus train MSE (Selected variables): {mse_selected - mse_train_selected}")
