# Marketing Classifier
This notebook uses the Bank Marketing dataset from the UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/bank+marketing


Useful links

- https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
- https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier.score
- https://discuss.analyticsvidhya.com/t/how-to-exclude-the-elements-from-the-legend-in-python/5393
- https://machinelearningmastery.com/framework-for-imbalanced-classification-projects/
- https://machinelearningmastery.com/tactics-to-combat-imbalanced-classes-in-your-machine-learning-dataset/
- https://towardsdatascience.com/machine-learning-classification-with-python-for-direct-marketing-2da27906ddac
- https://github.com/kunalBhashkar/Bank-Marketing-Data-Set-Classification/blob/master/Claffication_of_Bank_Marketing_Data_Set.ipynb



## Set Up

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the full bank-marketing CSV; fields are separated by semicolons, so the separator is given explicitly.
df = pd.read_csv('../data/bank-additional/bank-additional/bank-additional-full.csv', sep=";")

In [None]:
# NOTE: `size` is the total number of cells (rows x columns); use `df.shape` to see the row and column counts separately.
df.size

In [None]:
# Preview the first rows to check that the file parsed into separate columns.
df.head()

In [None]:
# Show the dtype pandas inferred for each column.
df.dtypes

In [None]:
# Summary statistics for the columns that describe() covers by default.
df.describe()

In [None]:
# Number of rows for each distinct value of `poutcome`.
df.groupby('poutcome')['poutcome'].count()

### Data Prep

1. Cleaning
2. Feature engineering

In [None]:
# Count missing (NaN) values per column; a non-zero count means that column needs cleaning.
df.isnull().sum()

In [None]:
# Columns used for modelling. The selection is assumed to come from a prior
# EDA / feature-analysis step that is not part of this notebook.
cat_feature_cols = [
    "marital",
    "education",
    "contact",
    "default",
    "housing",
    "loan",
    "poutcome",
]
num_feature_cols = [
    "age",
    "pdays",
    "previous",
    "emp.var.rate",
    "euribor3m",
    "nr.employed",
]
# Categorical columns first, numeric columns second.
feature_cols = [*cat_feature_cols, *num_feature_cols]

In [None]:
# Feature matrix: an independent copy of the selected columns, so later edits never touch df.
X = df[feature_cols].copy()
# Binary target: 1 where the `y` column equals 'yes', 0 for every other value.
y = df['y'].eq('yes').astype('int64')

## Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing, with a fixed seed for a repeatable split.
# The target is imbalanced (it is oversampled with SMOTE further down), so the
# split is stratified on y to keep the same yes/no ratio in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)

In [None]:
# Look at the categorical columns of the training split before they are encoded.
X_train[cat_feature_cols]

## Feature Engineering

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Learn the category vocabulary from the training split only, then apply the
# same encoding to both splits. handle_unknown='ignore' keeps transform() from
# raising on categories that were not present in the training split.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_train[cat_feature_cols])
X_train_cat_encoded = enc.transform(X_train[cat_feature_cols])
X_test_cat_encoded = enc.transform(X_test[cat_feature_cols])

In [None]:
from sklearn.preprocessing import StandardScaler

# Estimate the scaling statistics on the training split only, then apply them
# unchanged to both splits so no test information leaks into the scaling.
scaler = StandardScaler()
scaler.fit(X_train[num_feature_cols])
X_train_num_scaled = scaler.transform(X_train[num_feature_cols])
X_test_num_scaled = scaler.transform(X_test[num_feature_cols])

In [None]:
# Sanity check: both parts need the same number of rows to be joined column-wise.
X_train_cat_encoded.shape, X_train_num_scaled.shape

In [None]:
# Join the densified one-hot block and the scaled numeric block side by side.
# NOTE: this rebinds X_train / X_test from DataFrames to plain arrays, so the
# encoding and scaling cells cannot be re-run without re-running the split first.
X_train = np.hstack([X_train_cat_encoded.toarray(), X_train_num_scaled])
X_test = np.hstack([X_test_cat_encoded.toarray(), X_test_num_scaled])

In [None]:
# Sanity check: the train and test matrices must have the same number of columns.
X_test.shape, X_train.shape

### Imbalanced Classes

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample on the training split only, so the test split keeps its original
# class distribution. The seed makes the synthetic samples repeatable.
# `fit_resample` replaces `fit_sample`, which newer imbalanced-learn releases
# no longer provide.
sm = SMOTE(random_state=42)
X_balanced, y_balanced = sm.fit_resample(X_train, y_train)

## Classifier

In [None]:
# Random Forest Classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [None]:
# Baseline forest with 1000 trees. The seed makes the reported scores repeatable
# between runs; n_jobs=-1 builds the trees on all available cores.
rfc = RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1)

In [None]:
# Train on the SMOTE-balanced training data.
rfc.fit(X_balanced, y_balanced)

In [None]:
# F1 score of the forest on the held-out test split, which was not resampled.
f1_score(y_test.values, rfc.predict(X_test))


In [None]:
# Class counts after oversampling.
# NOTE(review): assumes the resampler returned a pandas Series — confirm for the installed imbalanced-learn version.
y_balanced.value_counts()

In [None]:
# Class counts in the test split, for comparison with the balanced training data.
y_test.value_counts()

## Hyperparameter Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV# Number of trees in random forest
# Candidate values for each hyperparameter sampled by the randomized search.

# Number of trees in the forest: 200, 650, 1100, 1550, 2000
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=5)]
# Number of features to consider at every split. 'auto' is not listed: for a
# classifier it was only an alias of 'sqrt', and current scikit-learn rejects it.
max_features = ['sqrt', 'log2']
# Maximum number of levels in a tree: 10, 30, ..., 110, plus None (unlimited)
max_depth = [int(x) for x in np.linspace(10, 110, num=6)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample (True) or the full data (False)
bootstrap = [True, False]

# Create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}

In [None]:
# Only run the search on small training sets: it can get too long otherwise.
# For larger training sets this cell intentionally does nothing.
if len(X_balanced) < 1000:
    # Base model whose hyperparameters are tuned.
    rf = RandomForestClassifier()
    # Randomized search: sample 10 parameter combinations from random_grid,
    # score each one with 3-fold cross-validated F1, and use all available cores.
    rf_random = RandomizedSearchCV(
        estimator=rf,
        param_distributions=random_grid,
        n_iter=10,
        cv=3,
        verbose=2,
        random_state=42,
        n_jobs=-1,
        scoring='f1',
    )
    # Fit the random search model
    rf_random.fit(X_balanced, y_balanced)
    # Best mean cross-validated F1 found by the search.
    print(rf_random.best_score_)
    # F1 on the held-out test split. X_test is already one-hot encoded and
    # scaled at this point, so it is passed to the model as-is; running it
    # through the encoder a second time would be wrong.
    print(f1_score(y_test.values, rf_random.best_estimator_.predict(X_test)))
    # F1 on the data the model was trained on, to compare against the test score.
    print(f1_score(y_balanced.values, rf_random.best_estimator_.predict(X_balanced)))