In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import pickle

ModuleNotFoundError: No module named 'seaborn'

In [None]:
# Optional (Google Colab only): uncomment the two lines below to mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Load the email-campaign dataset from a CSV file (path is relative to the working directory)
data=pd.read_csv('data_email_campaign.csv')

In [None]:
# Drop the Email_ID column so it is not carried into the feature matrix.
# Reassign instead of mutating in place, and tolerate the column already being
# gone, so that re-running this cell does not raise KeyError.
data = data.drop(columns='Email_ID', errors='ignore')

In [None]:
# Summarise the columns (dtypes and non-null counts) before imputing missing values
data.info()

In [None]:
column_names = data.columns
# Fill missing values in every column with that column's most frequent value.
# NOTE(review): the imputer is fitted on the full dataset (all columns, including
# the last one that is later used as the target) before the train/test split, so
# test rows influence the fill values — consider fitting on the training split only.
imp_most_frequent = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
# fit_transform does not return a DataFrame here, so rebuild the frame with the
# original column labels and row index in a single step.
data = pd.DataFrame(
    imp_most_frequent.fit_transform(data),
    columns=column_names,
    index=data.index,
)
# Preview the first rows only instead of rendering the whole frame.
data.head()

In [None]:
# Re-infer column dtypes: the frame was rebuilt from the imputer's output in the previous step
# (presumably leaving object-typed columns — confirm with data.dtypes)
data=data.convert_dtypes()

In [None]:
# Count the missing values left in each column after imputation
data.isna().sum()

In [None]:
# Visualize outliers: one horizontal box plot per column of interest.
for col in ['Subject_Hotness_Score', 'Total_Links', 'Total_Images']:
  fig, ax = plt.subplots()
  # Pass the series as `x=` explicitly: a bare positional argument is interpreted
  # differently across seaborn releases (as `x` in older ones, as `data` in newer ones).
  sns.boxplot(x=data[col], ax=ax)
  ax.set_title(f'{col} (before capping)')
  plt.show()

In [None]:
# Cap outliers with the IQR rule: values beyond IQR_MULTIPLIER * IQR from the
# quartiles are replaced by the nearest fence (rows are kept, not removed).
# NOTE(review): the fences are computed on the full dataset before the
# train/test split — consider deriving them from the training split only.
IQR_MULTIPLIER = 1.5
for col in ['Subject_Hotness_Score', 'Total_Links', 'Total_Images']:
  percentile25 = data[col].quantile(0.25)
  percentile75 = data[col].quantile(0.75)
  iqr = percentile75 - percentile25
  upper_limit = percentile75 + IQR_MULTIPLIER * iqr
  lower_limit = percentile25 - IQR_MULTIPLIER * iqr

  # Series.clip replaces the hand-rolled nested np.where; casting to float first
  # keeps the result a plain float column even when the fences are fractional.
  data[col] = data[col].astype(float).clip(lower=lower_limit, upper=upper_limit)

In [None]:
# Visualize the same columns again after capping the outliers.
for col in ['Subject_Hotness_Score', 'Total_Links', 'Total_Images']:
  fig, ax = plt.subplots()
  # Pass the series as `x=` explicitly: a bare positional argument is interpreted
  # differently across seaborn releases (as `x` in older ones, as `data` in newer ones).
  sns.boxplot(x=data[col], ax=ax)
  ax.set_title(f'{col} (after capping)')
  plt.show()

In [None]:
# Separate predictors from the label: the last column is the target,
# every column before it is a feature.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [None]:
# Hold out 20% of the rows for testing. Stratify on y so both splits keep the
# same class proportions (the training classes are rebalanced with SMOTE later,
# which suggests the target is imbalanced).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Display the training features produced by the split
X_train

In [None]:
# Preprocessing: one-hot encode the customer location (dropping the first level)
# and standardize the listed numeric columns; all remaining columns pass through.
categorical_cols = ['Customer_Location']
numeric_cols = [
    'Subject_Hotness_Score',
    'Total_Past_Communications',
    'Word_Count',
    'Total_Links',
    'Total_Images',
]
CT = ColumnTransformer(
    transformers=[
        ('tf1', OneHotEncoder(drop='first'), categorical_cols),
        ('tf2', StandardScaler(), numeric_cols),
    ],
    remainder='passthrough',
)

In [None]:
# Fit the preprocessing on the training split only
CT.fit(X_train)

In [None]:
# Apply the fitted transformer to both splits.
# Note: X_train / X_test are rebound to the transformed outputs, so this cell is
# meant to be run once after the split (re-running it would feed the transformed
# data back into CT).
X_train, X_test = CT.transform(X_train), CT.transform(X_test)

In [None]:
# Shape of the transformed training features
X_train.shape

In [None]:
# Balance the training classes with SMOTE oversampling. A fixed random_state
# makes the synthetic samples (and everything trained on them) reproducible.
st = SMOTE(random_state=42)

In [None]:
# Resample the training split only; X_test / y_test are not passed to SMOTE
X_train_smote, y_train_smote = st.fit_resample(X_train,y_train)

In [None]:
# create functions for metrics
def print_metrics(y_te, y_pr, target_names=None):
  """Print accuracy, weighted precision/recall/F1, the confusion matrix and a
  per-class classification report.

  Parameters
  ----------
  y_te : array-like
      True labels.
  y_pr : array-like
      Predicted labels.
  target_names : list of str, optional
      Display names for the classes, given in sorted-label order.
      Defaults to ['ignored', 'read', 'acknowledged'].
  """
  if target_names is None:
    # classification_report pairs target_names with the labels in sorted order.
    # NOTE(review): assumes the target is encoded 0=ignored, 1=read,
    # 2=acknowledged — confirm against the dataset's data dictionary. Under that
    # encoding the previous order ('acknowledged', 'read', 'ignored') attached
    # the wrong name to classes 0 and 2.
    target_names = ['ignored', 'read', 'acknowledged']

  print('Accuracy score:', accuracy_score(y_te, y_pr))
  print('Precision score:', precision_score(y_te, y_pr, average='weighted'))
  print('Recall score:', recall_score(y_te, y_pr, average='weighted'))
  print('F1 score:', f1_score(y_te, y_pr, average='weighted'))
  print('Confusion matrix:\n', confusion_matrix(y_te, y_pr))
  print('Classification report:\n',
        classification_report(y_te, y_pr, target_names=target_names))

def plot_confusion_matrix(y_te,y_pr):
  """Draw the confusion matrix of true vs. predicted labels as an annotated heatmap.

  Parameters
  ----------
  y_te : array-like
      True labels.
  y_pr : array-like
      Predicted labels.
  """
  fig, ax = plt.subplots()
  sns.heatmap(confusion_matrix(y_te, y_pr), annot=True, fmt=".0f", ax=ax)
  ax.set_title('Confusion Matrix')
  # sklearn's confusion_matrix puts true labels on rows and predictions on columns.
  ax.set_xlabel('Predicted label')
  ax.set_ylabel('True label')
  plt.show()

In [None]:
# XGBoost classifier tuned with an exhaustive grid search (5-fold cross-validation).
# NOTE(review): the search is fitted on data that was already oversampled with
# SMOTE, so the CV validation folds can contain synthetic points derived from
# training-fold rows; putting SMOTE and the classifier in an imblearn Pipeline
# and searching over that would avoid this.
param_for_XGB = {
    'min_child_weight': [0, 1, 5],
    'gamma': [0.5, 1],
    'subsample': [0.5, 0.6, 0.8],
    'colsample_bytree': [0.8, 0.9],
    'max_depth': [5, 6],
}

# Row/column subsampling is part of the grid, so pin the seed explicitly rather
# than relying on the library default.
XGB_for_gc = XGBClassifier(random_state=42)
gc_XGB = GridSearchCV(XGB_for_gc, param_for_XGB, n_jobs=-1, verbose=2, cv=5)
# Fit the Algorithm
gc_XGB.fit(X_train_smote, y_train_smote)

In [None]:
# Predict on the test split with the fitted grid search and report the metrics
y_pred_for_gc_XGB=gc_XGB.predict(X_test)
print_metrics(y_test,y_pred_for_gc_XGB)

In [None]:
# Hyperparameter combination selected by the grid search
gc_XGB.best_params_

In [None]:
# Print the test-set metrics again (reuses the predictions computed earlier; nothing is re-predicted here)
print_metrics(y_test,y_pred_for_gc_XGB)

In [None]:
# Plot the confusion matrix for the test-set predictions
plot_confusion_matrix(y_test,y_pred_for_gc_XGB)

In [None]:
# Persist the fitted ColumnTransformer. The context manager guarantees the file
# handle is flushed and closed, even if pickling fails.
with open('CT.pickle', 'wb') as ct_file:
  pickle.dump(CT, ct_file)

In [None]:
# Persist the fitted grid-search object (the whole GridSearchCV, not only its
# best estimator). The context manager guarantees the file handle is flushed
# and closed, even if pickling fails.
with open('model.pickle', 'wb') as model_file:
  pickle.dump(gc_XGB, model_file)