In [1]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Step 2: Read the training CSV into a DataFrame.
# The first CSV column is used as the row index rather than as a feature.
data = pd.read_csv(
    'train_dataset.csv',
    index_col=0,
)

In [None]:
# Step 3: Basic data overview
# Preview the first five rows of the training data.
print(data.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the summary.
data.info()

In [None]:
# Step 4: Check for missing values
# This cell only reports the per-column count of missing entries; nothing is
# modified here.
print(
    "Missing values:\n",
    data.isna().sum(),
)

In [5]:
# Step 5: Declare which feature columns are categorical and which are numerical.
# Both lists are consumed by name when the preprocessing transformer is built.
categorical_cols = [
    'Gender',
    'Category',
    'Time_of_Day',
    'Email_Interaction',
    'Device_Type',
    'Payment_Method',
    'Referral_Source',
]
numerical_cols = [
    'Age',
    'Reviews_Read',
    'Price',
    'Discount',
    'Items_In_Cart',
    'Socioeconomic_Status_Score',
    'Engagement_Score',
]

In [6]:
# Step 6: Separate the target from the features.
# 'Purchase' is the label column; every other column stays in X.
y = data['Purchase']
X = data.drop(columns=['Purchase'])

In [7]:
# Step 7: Preprocessing pipeline
# Numerical columns: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: one-hot encode; handle_unknown='ignore' keeps transform
# from raising on categories that were not present when the encoder was fitted.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group to its transformer by column name.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

In [8]:
# Step 8: Chain preprocessing and the classifier into a single estimator, so
# the same transformations are applied at fit time and at predict time.
# random_state is fixed to make the Random Forest reproducible across runs.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

In [9]:
# Step 9: Hold out 20% of the rows for testing.
# The fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Step 10: Fit the full pipeline (preprocessing + classifier) on the training split.
model.fit(X=X_train, y=y_train)

In [None]:
# Step 11: Score the fitted pipeline on the held-out test split.
y_pred = model.predict(X_test)

# Overall accuracy, followed by the per-class classification report.
print(
    "Accuracy: ",
    accuracy_score(y_true=y_test, y_pred=y_pred),
)
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)

In [12]:
# Step 12: Read the evaluation CSV (expected to have no 'Purchase' column).
# As with the training file, the first CSV column becomes the row index.
evaluation_data = pd.read_csv(
    'test_dataset.csv',
    index_col=0,
)

In [13]:
# Step 13: Predict 'Purchase' for the evaluation rows.
# The pipeline applies its fitted preprocessing before the classifier runs.
evaluation_predictions = model.predict(
    evaluation_data
)

In [14]:
# Step 14: Assemble the submission table.
# 'id' comes from the evaluation frame's index; 'Purchase' holds the predictions.
predictions_df = pd.DataFrame(
    data={'id': evaluation_data.index, 'Purchase': evaluation_predictions}
)

In [None]:
# Step 15: Write the predictions to disk.
# index=False omits the DataFrame's own row index; the 'id' column is kept.
predictions_df.to_csv(
    'purchase_predictions_eval.csv',
    index=False,
)
print("Predictions saved to 'purchase_predictions_eval.csv'")