# Introduction

Hello everyone, this code is part of the Maven Analytics challenge. The EDA will be performed in Power BI, but I'd like to create a simple predictive model using XGBoost to spice things up!

Link to challenge: 

https://www.mavenanalytics.io/blog/maven-marketing-challenge?utm_source=linkedin&utm_campaign=marketingchallenge_li_maven

# Importing necessary libraries

In [None]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows',50)
from sklearn.preprocessing import OrdinalEncoder
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, classification_report, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

# Reading and exploring basic information

In [None]:
dataset = pd.read_csv("../input/marketing-data/marketing_data.csv")

In [None]:
dataset

In [None]:
dataset.info()

In [None]:
dataset.columns

In [None]:
#renaming column
dataset.rename(columns={" Income ": "Income"}, inplace = True )

In [None]:
dataset.isnull().sum() 

In [None]:
#checking unique values
dataset['Education'].unique().tolist()


In [None]:
dataset['Marital_Status'].unique().tolist()

In [None]:
dataset['Country'].unique().tolist()

In [None]:
#checking if target data are balanced
dataset['Response'].value_counts()

# Preprocessing

In [None]:
def _clean_marketing_frame(dataset):
    """Return a cleaned, one-hot encoded copy of the raw frame; no statistics are fitted here."""
    cleaned = dataset.copy()

    # Remove '$' and ',' so the income strings can be cast to float (NaNs are kept for now).
    cleaned["Income"] = cleaned["Income"].replace('[$,]', '', regex=True).astype(float)

    # Replace the enrolment date by its numeric year / month / day parts.
    enrolled = pd.to_datetime(cleaned["Dt_Customer"])
    cleaned["Year"] = enrolled.dt.year
    cleaned["Month"] = enrolled.dt.month
    cleaned["Day"] = enrolled.dt.day

    # Merge sparse / equivalent category labels.
    cleaned["Education"] = cleaned["Education"].replace({"2n Cycle": "Master"})
    cleaned["Marital_Status"] = cleaned["Marital_Status"].replace(
        {"Alone": "Single", "YOLO": "Other", "Absurd": "Other"}
    )

    # The date has been decomposed and the ID carries no predictive information.
    cleaned = cleaned.drop(columns=["Dt_Customer", "ID"])

    # dtype=float keeps the dummy columns numeric on every pandas version
    # (recent versions would otherwise produce bool columns).
    return pd.get_dummies(
        data=cleaned,
        columns=["Education", "Marital_Status", "Country"],
        dtype=float,
    )


def preprocessing(dataset):
    """Turn the raw marketing frame into model-ready train / test sets.

    Steps: clean and one-hot encode the features, split 75/25 with a fixed
    seed, impute missing incomes, standardise the features, and balance the
    training set with SMOTE.

    The income mean and the scaler are fitted on the training rows only and
    then applied to the test rows, so no test-set statistics leak into
    training. SMOTE is applied to the training rows only.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw data containing at least the columns "Income", "Dt_Customer",
        "Education", "Marital_Status", "Country", "ID" and "Response".
        The input frame is not modified.

    Returns
    -------
    X_train_res
        Scaled, SMOTE-resampled training features.
    X_test : pandas.DataFrame
        Scaled test features (not resampled).
    y_train_res : numpy.ndarray
        Training labels matching ``X_train_res``.
    y_test : pandas.Series
        Test labels.
    """
    cleaned = _clean_marketing_frame(dataset)

    # X, y split
    X = cleaned.drop("Response", axis=1)
    y = cleaned["Response"]

    # Split first so that every fitted statistic below only sees training rows.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

    # Impute missing incomes with the *training* mean.
    income_mean = X_train["Income"].mean()
    X_train = X_train.assign(Income=X_train["Income"].fillna(income_mean))
    X_test = X_test.assign(Income=X_test["Income"].fillna(income_mean))

    # Standardise: fit on train, apply the same transform to test.
    sc = StandardScaler()
    X_train = pd.DataFrame(sc.fit_transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(sc.transform(X_test), index=X_test.index, columns=X_test.columns)

    # Balance the training classes. `fit_resample` replaces the removed
    # `fit_sample`; `to_numpy()` replaces the deprecated `Series.ravel()`.
    sm = SMOTE(random_state=2)
    X_train_res, y_train_res = sm.fit_resample(X_train, y_train.to_numpy())

    return X_train_res, X_test, y_train_res, y_test

In [None]:
# Build the model inputs: SMOTE-resampled training data and the held-out test data.
X_train_res, X_test, y_train_res, y_test = preprocessing(dataset)

# Fitting XGBoost and evaluating results before tuning

In [None]:
# Baseline model: XGBoost with library-default hyper-parameters,
# trained on the resampled training set.
xg = xgb.XGBClassifier().fit(X_train_res, y_train_res)

# Predictions on the training data and on the held-out test data.
y_pred_train = xg.predict(X_train_res)
y_pred = xg.predict(X_test)

In [None]:
# Test-set metrics of the untuned model, printed as percentages.
for label, metric in (
    ('Model Accuracy : ', accuracy_score),
    ('Model Recall : ', recall_score),
    ('Model Precision : ', precision_score),
    ("F1 Score: ", f1_score),
):
    print(label, metric(y_test, y_pred) * 100)

In [None]:
metrics_before_tune = ("Accuracy before tune", "Recall  before tune", "Precision  before tune", "F1  before tune")

In [None]:
eval_before_tune = pd.DataFrame(xg, index=metrics_before_tune, columns=["Score"])

In [None]:
eval_before_tune.loc["Recall  before tune", "Score"] = recall_score(y_test, y_pred)
eval_before_tune.loc["Accuracy before tune", "Score"] =  accuracy_score(y_test, y_pred)
eval_before_tune.loc["Precision  before tune", "Score"] = precision_score(y_test, y_pred)
eval_before_tune.loc["F1  before tune", "Score"] = f1_score(y_test, y_pred)

In [None]:
eval_before_tune

In [None]:
# Same metrics on the (resampled) training data, printed as percentages;
# compare with the test-set figures to judge over-fitting.
train_report = (
    ('Model Accuracy : ', accuracy_score),
    ('Model Recall : ', recall_score),
    ('Model Precision : ', precision_score),
    ("F1 Score: ", f1_score),
)
for caption, score_fn in train_report:
    print(caption, score_fn(y_train_res, y_pred_train) * 100)

# Checking which features are important for the model

In [None]:
name = X_train.columns

In [None]:
importance = pd.DataFrame(xg.feature_importances_, index = name, columns = ["Score"]).sort_values("Score", ascending = False)

In [None]:
importance

# Checking correlation among features

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# `X_train` is local to `preprocessing` and undefined at notebook scope.
# Use the training matrix that is actually returned (it includes the SMOTE
# rows), labelled with the feature names taken from `X_test`.
train_features = pd.DataFrame(X_train_res, columns=X_test.columns)

#Using Pearson Correlation
plt.figure(figsize=(12,10))
cor = train_features.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.CMRmap_r)
plt.show()

# Parameter tuning

In [None]:
# Candidate hyper-parameter values sampled by the randomized search.
parameters = {
    'learning_rate': [0.01, 0.5],
    'max_depth': [3, 5, 7, 10, 50],
    'min_child_weight': [1, 3, 5, 10],
    'subsample': [0.1, 0.7],
    'colsample_bytree': [0.5, 0.9],
    'n_estimators': [1, 20, 50],
    # The target is a binary class label, so use the logistic objective;
    # 'reg:squarederror' is a regression loss and does not fit a classifier.
    'objective': ['binary:logistic'],
}
# Metric the search optimises.
scorer = make_scorer(accuracy_score)

In [None]:
# One scorer object per evaluation metric, built in a single pass.
scorer_accuracy, scorer_recall, scorer_precision, scorer_f1 = (
    make_scorer(metric_fn)
    for metric_fn in (accuracy_score, recall_score, precision_score, f1_score)
)

In [None]:
def generate_xg_from_search(xg, parameters, scorer, X, y):
    """Run a randomized hyper-parameter search and return the best estimator.

    Parameters
    ----------
    xg : estimator to tune.
    parameters : dict mapping parameter names to the candidate values to sample.
    scorer : scoring passed to ``RandomizedSearchCV`` to rank candidates.
    X, y : training features and labels the search is fitted on.

    Returns
    -------
    The ``best_estimator_`` of the fitted search.
    """
    return RandomizedSearchCV(xg, parameters, scoring=scorer).fit(X, y).best_estimator_

In [None]:
scores = cross_val_score(best_xg, X_train_res, y_train_res, cv=5, scoring= "accuracy")
scores.mean()


In [None]:
scorer_recall = cross_val_score(best_xg, X_train_res, y_train_res, cv=5, scoring= "recall")
scorer_accuracy = cross_val_score(best_xg, X_train_res, y_train_res, cv=5, scoring= "accuracy")
scorer_precision = cross_val_score(best_xg, X_train_res, y_train_res, cv=5, scoring= "precision")
scorer_f1_score = cross_val_score(best_xg, X_train_res, y_train_res, cv=5, scoring= "f1_macro")


In [None]:
metrics_after_tune = ("Accuracy after tuning", "Recall after tuning", "Precision after tuning", "F1 after tuning")

In [None]:
eval_after_tune = pd.DataFrame(make_scorer, index=metrics_after_tune, columns=["Score"])

In [None]:
eval_after_tune.loc["Recall after tuning", "Score"] = (scorer_recall.mean() * 100)
eval_after_tune.loc["Accuracy after tuning", "Score"] = (scorer_accuracy.mean()*100)
eval_after_tune.loc["Precision after tuning", "Score"] = (scorer_f1_score.mean()*100)
eval_after_tune.loc["F1 after tuning", "Score"] = (scorer_precision.mean()*100)

In [None]:
eval_after_tune