In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE  # for handling class imbalance

In [None]:
# Read the tab-separated campaign export from the mounted Drive folder.
campaign_csv_path = "/content/drive/MyDrive/marketing_campaign.csv"
data = pd.read_csv(campaign_csv_path, sep="\t")

In [None]:
# Preview the first five rows of the loaded frame (five is head()'s default).
data.head(n=5)


Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0


In [None]:
# Show the column labels of the loaded frame.
data.columns

Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response'],
      dtype='object')

In [None]:
# print("Dataset Information:")
# print(data.info())
# print("\nSample Data:")
# print(data.head(10))


In [None]:
# Drop every row that contains at least one missing value
# (imputing the gaps would be the alternative).
data = data.dropna()
print(data.head())  # Display the first few rows
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
data.info()  # Display column information

     ID  Year_Birth   Education Marital_Status   Income  Kidhome  Teenhome  \
0  5524        1957  Graduation         Single  58138.0        0         0   
1  2174        1954  Graduation         Single  46344.0        1         1   
2  4141        1965  Graduation       Together  71613.0        0         0   
3  6182        1984  Graduation       Together  26646.0        1         0   
4  5324        1981         PhD        Married  58293.0        1         0   

  Dt_Customer  Recency  MntWines  ...  NumWebVisitsMonth  AcceptedCmp3  \
0  04-09-2012       58       635  ...                  7             0   
1  08-03-2014       38        11  ...                  5             0   
2  21-08-2013       26       426  ...                  4             0   
3  10-02-2014       26        11  ...                  6             0   
4  19-01-2014       94       173  ...                  5             0   

   AcceptedCmp4  AcceptedCmp5  AcceptedCmp1  AcceptedCmp2  Complain  \
0             0

In [None]:

# Handle missing values by dropping them (you could also choose to impute them)
data = data.dropna()
print(data.head())  # Display the first few rows
# DataFrame.info() prints its own report and returns None, so it is not wrapped
# in print() (that would emit an extra "None" line).
data.info()  # Display column information

# Identify target and feature columns
target_column = 'Response'  # Change to the actual name if necessary
X = data.drop(columns=[target_column])
y = data[target_column]

# Handle categorical variables using one-hot encoding.
# NOTE(review): 'ID' and 'Dt_Customer' are still part of X at this point. If
# Dt_Customer is stored as text, get_dummies creates one indicator column per
# distinct date -- confirm that this is intended.
X = pd.get_dummies(X, drop_first=True)

# Split ONCE, before any resampling, so the held-out set contains only real
# customers in their natural class ratio. stratify=y keeps that ratio equal in
# both splits. (Previously a first 80/20 split and its scaling were silently
# overwritten by a second 70/30 split of SMOTE-resampled data.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Standardize using statistics learned from the training split only, then apply
# the same transform to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Address class imbalance using SMOTE on the TRAINING split only. Resampling
# before the split leaks information: synthetic points interpolated from rows
# that later land in the test set end up in training, and the test set itself
# contains synthetic rows, which inflates every reported metric.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

# The models below are fitted on the balanced training data; X_test / y_test
# stay untouched by SMOTE.
X_train, y_train = X_res, y_res

# Step 2: Define a function for model training and evaluation
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and report its test-split scores.

    The model is (re)fitted on ``X_train`` / ``y_train``. Class predictions for
    ``X_test`` feed the accuracy, F1, confusion matrix and classification
    report; column 1 of ``model.predict_proba(X_test)`` feeds the ROC AUC.
    All results are printed to stdout.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        Tuple ``(accuracy, f1, auc)`` computed on the test split.
    """
    model.fit(X_train, y_train)

    predicted_labels = model.predict(X_test)
    # Scores taken from column index 1 of the probability matrix.
    second_class_scores = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, predicted_labels)
    f1 = f1_score(y_test, predicted_labels)
    auc = roc_auc_score(y_test, second_class_scores)

    # Scalar metrics first, in a fixed order, then the tabular reports.
    for caption, value in (
        ("Accuracy:", accuracy),
        ("F1 Score:", f1),
        ("AUC Score:", auc),
    ):
        print(caption, value)
    print("Confusion Matrix:\n", confusion_matrix(y_test, predicted_labels))
    print("Classification Report:\n", classification_report(y_test, predicted_labels))

    return accuracy, f1, auc

# Each section below follows the same recipe: print a banner, grid-search the
# hyperparameters with 3-fold CV scored by F1, take the refitted best
# estimator, and score it on the held-out split via evaluate_model().

# --- Gradient Boosting ---
print("\n--- Gradient Boosting Classifier Results ---")
gb_param_grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[50, 100, 150],
    max_depth=[3, 5, 7],
)
gb_model = GradientBoostingClassifier(random_state=42)
gb_grid = GridSearchCV(estimator=gb_model, param_grid=gb_param_grid, scoring='f1', cv=3)
gb_grid.fit(X_train, y_train)
best_gb_model = gb_grid.best_estimator_
gb_accuracy, gb_f1, gb_auc = evaluate_model(
    best_gb_model, X_train, X_test, y_train, y_test
)

# --- AdaBoost ---
print("\n--- AdaBoost Classifier Results ---")
ada_param_grid = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.01, 0.1, 0.2],
)
ada_model = AdaBoostClassifier(random_state=42)
ada_grid = GridSearchCV(estimator=ada_model, param_grid=ada_param_grid, scoring='f1', cv=3)
ada_grid.fit(X_train, y_train)
best_ada_model = ada_grid.best_estimator_
ada_accuracy, ada_f1, ada_auc = evaluate_model(
    best_ada_model, X_train, X_test, y_train, y_test
)

# --- XGBoost ---
print("\n--- XGBoost Classifier Results ---")
xgb_param_grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[50, 100, 150],
    max_depth=[3, 5, 7],
)
xgb_model = XGBClassifier(random_state=42)
xgb_grid = GridSearchCV(estimator=xgb_model, param_grid=xgb_param_grid, scoring='f1', cv=3)
xgb_grid.fit(X_train, y_train)
best_xgb_model = xgb_grid.best_estimator_
xgb_accuracy, xgb_f1, xgb_auc = evaluate_model(
    best_xgb_model, X_train, X_test, y_train, y_test
)

# --- Side-by-side summary of the three tuned models ---
print("\n--- Model Performance Summary ---")
_summary_rows = (
    ("Gradient Boosting - Accuracy:", gb_accuracy, gb_f1, gb_auc),
    ("AdaBoost - Accuracy:", ada_accuracy, ada_f1, ada_auc),
    ("XGBoost - Accuracy:", xgb_accuracy, xgb_f1, xgb_auc),
)
for _label, _acc, _f1, _auc in _summary_rows:
    print(_label, _acc, "F1 Score:", _f1, "AUC:", _auc)


     ID  Year_Birth   Education Marital_Status   Income  Kidhome  Teenhome  \
0  5524        1957  Graduation         Single  58138.0        0         0   
1  2174        1954  Graduation         Single  46344.0        1         1   
2  4141        1965  Graduation       Together  71613.0        0         0   
3  6182        1984  Graduation       Together  26646.0        1         0   
4  5324        1981         PhD        Married  58293.0        1         0   

  Dt_Customer  Recency  MntWines  ...  NumWebVisitsMonth  AcceptedCmp3  \
0  04-09-2012       58       635  ...                  7             0   
1  08-03-2014       38        11  ...                  5             0   
2  21-08-2013       26       426  ...                  4             0   
3  10-02-2014       26        11  ...                  6             0   
4  19-01-2014       94       173  ...                  5             0   

   AcceptedCmp4  AcceptedCmp5  AcceptedCmp1  AcceptedCmp2  Complain  \
0             0



Accuracy: 0.8840707964601769
F1 Score: 0.8792626728110599
AUC Score: 0.9537286823891391
Confusion Matrix:
 [[522  51]
 [ 80 477]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.91      0.89       573
           1       0.90      0.86      0.88       557

    accuracy                           0.88      1130
   macro avg       0.89      0.88      0.88      1130
weighted avg       0.89      0.88      0.88      1130


--- XGBoost Classifier Results ---
Accuracy: 0.9256637168141593
F1 Score: 0.9234972677595629
AUC Score: 0.9805333358399052
Confusion Matrix:
 [[539  34]
 [ 50 507]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.94      0.93       573
           1       0.94      0.91      0.92       557

    accuracy                           0.93      1130
   macro avg       0.93      0.93      0.93      1130
weighted avg       0.93      0.93      0.93      1130


-