In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, roc_curve, \
    RocCurveDisplay, auc, precision_score, recall_score, f1_score
from sklearn.model_selection import RepeatedKFold, GridSearchCV, train_test_split, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier


In [2]:
# Import data
data = pd.read_csv("bank-additional-full.csv", delimiter=";")
data

Check the overview of the data:

In [3]:
data.info()

# Look for duplicates

In [4]:
data.duplicated().sum()

There in total 12 duplicated rows.

In [5]:
duplicate_rows = data[data.duplicated(keep=False)]
duplicate_rows

In [6]:
data.shape # before removal

Remove the duplicates because it can scew the training.

In [7]:
data = data.drop_duplicates()

In [8]:
data.duplicated().sum()

In [9]:
data.shape # after removal

# Look for NA values

Have to check for any missing values (NA), because the present of them will bring problems in training the model:

In [10]:
data.isna().sum()

In [11]:
data.isnull().sum().sum()

In [12]:
data.isnull().sum()

# Data cleaning

In [13]:
# Copy of the dataset just in case
data_new = data.copy()
data_new

In [14]:
data_new.columns

Checking how many Yes and No there is in the dependent feature or our output:

In [15]:
data_new["y"].value_counts()

Seems that there are more No than Yes values.

In [16]:
get_categorical = data_new.select_dtypes(include="object").columns
#data_new[get_categorical] = data_new[get_categorical].astype("category")

In [17]:
data_new.info()

In [18]:
categorical_columns = data_new.select_dtypes(include='object').columns
categorical_columns

In [19]:
numerical_columns = data_new.select_dtypes(include=["int64","float64"]).columns
numerical_columns

In [20]:
both_value_types = {"categorical":categorical_columns, "numerical":numerical_columns}

In [21]:
for coll in categorical_columns:
    print(data_new[coll].value_counts(),"\n")

Check numerical columns, to investigate what is happening and check for outliers.

In [22]:
plt.figure(figsize=(6, 12))
for i, col in enumerate(numerical_columns, 1):
    plt.subplot(5, 2, i)
    sns.boxplot(y=data_new[col])
    plt.title(col)
    plt.ylabel('')

plt.tight_layout()
plt.show()

Other than `campaing` and `age` having a lot of outliers, the rest seem fine. Let's check `age`:

In [23]:
plt.figure(figsize=(10, 6))
plt.hist(data_new['age'])
plt.xlabel('Age')
plt.ylabel('Count')
plt.title('Distribution of Ages Among Clients')
plt.show()

The `age` seems resonable so I will try not to adjust on it.  
But I could try to deal with the ouliers in the `campaign`:

In [24]:
plt.figure(figsize=(10, 6))
plt.hist(data_new['campaign'])
plt.xlabel('Campaign')
plt.ylabel('Count')
plt.title('Campaing')
plt.show()

The data for `campaign` is very scewed to the right, so I could use **IQR method**, since it is used for left or right-skewed data based on this [article](https://www.analyticsvidhya.com/blog/2022/09/dealing-with-outliers-using-the-iqr-method/).

In [25]:
data_new['campaign'].describe()

In [26]:
percentile25 = data_new['campaign'].quantile(0.25)
percentile75 = data_new['campaign'].quantile(0.75)

print("75th quartile: ",percentile75)
print("25th quartile: ",percentile25)

In [27]:
IQR = percentile75 - percentile25
print ("IQR: ", IQR)

In [28]:
upper_limit = percentile75 + 1.5 * IQR
lower_limit = percentile25 - 1.5 * IQR

print("Upper limit",upper_limit)
print("Lower limit",lower_limit)

In [29]:
data_new[data_new['campaign'] > upper_limit]

In [30]:
data_new[data_new['campaign'] > upper_limit].count()

Number of ourlier rows is 2406.

In [31]:
new_df = data_new[data_new['campaign'] < upper_limit] # To keep the original data safe
new_df.shape

After adjustment:

In [32]:
plt.figure(figsize=(16,8))

plt.subplot(1,2,1)
sns.boxplot(y=new_df["campaign"])

plt.subplot(1,2,2)
plt.hist(new_df['campaign'])

I could check if the outcome of the previous campaign had any influence on this data.

In [33]:
sns.countplot(data=new_df, x='poutcome', hue='y')

Seems like it does have some influence in the outcome.

I am not gonna include `pdays` because it describes previous campaign and how many days has been since last contact. Also `duration` will be exluded as stated: "Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model."

In [34]:
sns.countplot(data=new_df, x='contact', hue='y') # does have some influence

In [35]:
plt.subplots(figsize=(16,8))
sns.heatmap(data_new.corr().abs(), annot=True)
plt.show()

In [36]:
plt.figure(figsize=(14,4))

plt.subplot(1,3,1)
sns.countplot(data=new_df, x='cons.price.idx', hue='y')

plt.subplot(1,3,2)
sns.countplot(data=new_df, x='cons.conf.idx', hue='y')

plt.subplot(1,3,3)
sns.countplot(data=new_df, x='emp.var.rate', hue='y')

# Engineering

In [37]:
both_value_types

In [38]:
new_df["education"].value_counts()

I will combine all of the `basic` into one `basic`:

In [39]:
basic=['basic.9y','basic.6y','basic.4y']
for years in basic:
    new_df.loc[new_df['education'] == years, 'education'] = "basic"

In [40]:
new_df["education"].value_counts()

In [41]:
new_df.shape, new_df._is_view, print(new_df._is_copy)

In [42]:
new_df["marital"].value_counts()

# Remove unnesecary columns

As mentioned previously I will remove the features that are not needed.

In [43]:
modified_df = new_df.copy()

In [44]:
# Columns to remove
remove_col = ['duration','pdays']

modified_df = modified_df.drop(remove_col, axis=1)
modified_df

# Set columns that have `yes`, `no` and `unknown` values to 1, 0, -1

I change it to 1, 0 and -1 becaue I am not sure whether the bank means `unknown` as in missing data or just part of the scale.

In [45]:
# Check which columns have yes, no and unknown values
for column in both_value_types["categorical"]:
    if column in modified_df.columns:
        print(modified_df[column].value_counts(),"\n")

The columns that have Yes and No are `default`, `housing`, `loand` and `y`.

In [46]:
columns_to_replace = ['default', 'housing', 'loan','y']
for column in columns_to_replace:
    modified_df[column] = modified_df[column].replace({'yes': 1, 'no': 0, 'unknown': -1})

In [47]:
modified_df.loc[:,['housing','default','loan','y']].head()

In [48]:
modified_df["y"].value_counts()

In [49]:
modified_df

In [50]:
modified_df.isnull().sum()

# Feature selection

In [51]:
# Taken from https://stackoverflow.com/questions/61146233/how-to-use-sklearn-chi-square-or-anova-to-removes-redundant-features
def get_feature_correlation(df, top_n=None, corr_method='spearman',
                            remove_duplicates=True, remove_self_correlations=True):
    """
    Compute the feature correlation and sort feature pairs based on their correlation

    :param df: The dataframe with the predictor variables
    :type df: pandas.core.frame.DataFrame
    :param top_n: Top N feature pairs to be reported (if None, all of the pairs will be returned)
    :param corr_method: Correlation compuation method
    :type corr_method: str
    :param remove_duplicates: Indicates whether duplicate features must be removed
    :type remove_duplicates: bool
    :param remove_self_correlations: Indicates whether self correlations will be removed
    :type remove_self_correlations: bool

    :return: pandas.core.frame.DataFrame
    """
    corr_matrix_abs = df.corr(method=corr_method).abs()
    corr_matrix_abs_us = corr_matrix_abs.unstack()
    sorted_correlated_features = corr_matrix_abs_us \
        .sort_values(kind="quicksort", ascending=False) \
        .reset_index()

    # Remove comparisons of the same feature
    if remove_self_correlations:
        sorted_correlated_features = sorted_correlated_features[
            (sorted_correlated_features.level_0 != sorted_correlated_features.level_1)
        ]

    # Remove duplicates
    if remove_duplicates:
        sorted_correlated_features = sorted_correlated_features.iloc[:-2:2]

    # Create meaningful names for the columns
    sorted_correlated_features.columns = ['Feature 1', 'Feature 2', 'Correlation (abs)']

    if top_n:
        return sorted_correlated_features[:top_n]

    return sorted_correlated_features

In [52]:
get_feature_correlation(modified_df)

Remove the `nr.employed` and `emp.var.rate` because they are related to employment in the bank and are highly correlated. Based on [this article](https://medium.com/@abdallahashraf90x/all-you-need-to-know-about-correlation-for-machine-learning-e249fec292e9) it is advised to remove highly correlated features.

In [53]:
# Columns to remove
remove_col_emp = ['nr.employed','emp.var.rate']

modified_df = modified_df.drop(remove_col_emp, axis=1)
modified_df

In [54]:
get_feature_correlation(modified_df)

# One hot code categorical values

In [55]:
both_value_types

In [56]:
modified_df.columns

In [57]:
list_to_hotcode = ['job', 'marital', 'education', 'contact', 'month', 'day_of_week', 'poutcome']

In [58]:
modified_df

In [59]:
modified_df['poutcome'].value_counts()

In [60]:
encoded_df = pd.get_dummies(modified_df, columns=list_to_hotcode)

In [61]:
encoded_df.head()

In [62]:
encoded_df.columns

In [63]:
# Get numerical columns again

new_numerical_cl = encoded_df.select_dtypes(include=["int64","float64"]).columns
new_numerical_cl

Exclude `housing`,`default`,`loan`, `y` from the numericals

In [64]:
new_numerical_cl = ['age', 'campaign', 'previous', 'cons.price.idx', 'cons.conf.idx', 'euribor3m']

In [65]:
# Scale them using StandardScaler

scaler = StandardScaler()

encoded_df[new_numerical_cl] = scaler.fit_transform(encoded_df[new_numerical_cl])

encoded_df.head()

In [66]:
encoded_df["y"]

In [67]:
encoded_df.shape

In [68]:
encoded_df['y'].value_counts()

In [69]:
# save preprocesed data

encoded_df.to_csv("prepro_bank_data.csv")

The features selected represent the customers and influence of previous campaing towards the current one the best in my opinion. Only the ones that are very much directly related to the previous campaing, employ count in the bank and last contact is unnecessary.

# Getting input (x) and output (y)

In [70]:
# Preparing data

x_data = encoded_df.loc[:, encoded_df.columns != 'y']
x_data

In [71]:
#y_data = data[["y"]]
#y_data

y_data = encoded_df[["y"]]
y_data

In [72]:
x_data_np = x_data.to_numpy()
y_data_np = y_data.to_numpy()

In [73]:
x_data_np

In [74]:
print(y_data_np.shape)
print(x_data_np.shape)

In [75]:
y = np.ravel(y_data_np)
y.shape

# Best algorithm selection

In [76]:
seed = 50

First I will test 5 different algorhitms:

*   Logistic Regression
*   C-SVM
*   Linear SVM
*   KNN
*   Decision Tree



In [77]:
models = [('Logistic Regression', LogisticRegression(max_iter=300)),
          ('KNN', KNeighborsClassifier()),
          ('Decision Tree Classifier', DecisionTreeClassifier()),
          ('Linear Support Vector Machine', LinearSVC(dual=False)),
          ('Support Vector Machine', SVC())]

In [78]:
models

In [None]:
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    kfold = RepeatedKFold(n_splits=10, n_repeats=3, random_state=seed)
    cv_results = model_selection.cross_val_score(model, x_data_np, y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    results_overviews = f"{name}: {cv_results.mean()}, ({cv_results.std()})"
    print(results_overviews)

In [None]:
scoring = 'accuracy'
kfold = RepeatedKFold(n_splits=10, n_repeats=3, random_state=seed)
model_rf = RandomForestClassifier()
cv_results_rf = model_selection.cross_val_score(model_rf, x_data_np, y, cv=kfold, scoring=scoring)
results_overviews_rf = f"Random Forest Classifier: {cv_results_rf.mean()}, ({cv_results_rf.std()})"
print(results_overviews_rf)

In [None]:
df = pd.DataFrame(results)
df = df.transpose()
df.columns = ['LR', 'KNN', 'Decision Tree Classifier', 'Linear Support Vector Machine (LinearSVC)', 'SVM (SVC)']

In [None]:
df["Random Forest Classifier"] = cv_results_rf

In [None]:
# Save to csv
df.to_csv("model_selection_res_new.csv")

In [None]:
results_df = pd.read_csv("model_selection_res_new.csv")
results_df = results_df.drop(results_df.columns[0],axis=1)
results_df.head()

In [None]:
cl_names = results_df.columns
fig = plt.figure(figsize=(10,6))
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results_df)
ax.set_xticklabels(cl_names)
fig.autofmt_xdate()
plt.show()

Based on the results and the above boxplots I believe the best ones to choose would be Logistics Regression and SVM (SVC). However, KNN LinearSVC would also could be nice contenders for further investigation.

But it took very long for SVC() to run (>1h). So I will continue with LinearSVC(), which is stated by scikit learn as better algorithm for big datasets.

I will have in mind the other two algorithms: Random Forest Classifier and KNN Classifier.

# Spliting dataset

In [None]:
y_data.value_counts() # when 0 is No and 1 is Yes

In [None]:
x_data_np.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x_data_np, y, test_size=0.30, stratify=y, random_state=seed)

Since we have more No than Yes I will set the `stratify` parameter. `Stratify` parameter makes a split so that the proportion of values in the sample produced will be the same as the proportion of values provided to parameter `stratify`. By splitting into train and test without stratifying, we might run into the trouble of having only the 'yes' falling into our training set, and all the 'no' falling into our test set.  

Did splitting because the importance is mentioned [here](https://stats.stackexchange.com/questions/453221/should-i-use-gridsearchcv-on-all-of-my-data-or-just-the-training-set).

In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

Below I will check if the proportions of the y test and train sets are correct based on our original data:

In [None]:
unique_values_TRAIN, counts_TRAIN = np.unique(y_train, return_counts=True)

# Combine unique values and counts into a dictionary
class_counts_TRAIN = dict(zip(unique_values_TRAIN, counts_TRAIN))

# Display the result
print(class_counts_TRAIN)

In [None]:
unique_values_TEST, counts_TEST = np.unique(y_test, return_counts=True)

# Combine unique values and counts into a dictionary
class_counts_TEST = dict(zip(unique_values_TEST, counts_TEST))

# Display the result
print(class_counts_TEST)

In [None]:
y_data.value_counts()

In [None]:
print(f'Proportion of Target in the Original Data \n{y_data.value_counts() / len(y_data)}\n')
print(f'Proportion of Target in the Training Data \n 0: {class_counts_TRAIN[0] / len(y_train)} \n 1: {class_counts_TRAIN[1] / len(y_train)}\n')
print(f'Proportion of Target in the Test Data \n 0: {class_counts_TEST[0] / len(y_test)} \n 1: {class_counts_TEST[1] / len(y_test)}')

# Selection of best hyperparameters for selected models

SVC takes super long to run so I run it with standard parameters during training. But I will look for hyperparameters for LinearSVC and Logistics Regression, as well as Random Forest Classifier and KNN Classifier.

I know that we have to work with two algorhitms only, even if the SVC one based on previous testing is producing better results, however, it takes very long to finish during model selection and hyperarameters selection. And I think speed is also an important aspect when choosing the algorhitm.

In [None]:
kf = RepeatedKFold(n_splits=10, n_repeats=3, random_state=seed) # setting cross validation

In [None]:
model_params = {
    'logistic_regression' : {
        'model' : LogisticRegression(random_state=seed, max_iter=1200),
        'params' : {
            'C': np.logspace(-4, 4, 10), 'solver': ["lbfgs", "liblinear", "sag", "saga"]
        }
    },
    'LSvm': {
        'model' : LinearSVC(random_state=seed, dual=False), # dual = False, is when n_samples > n_features
        'params' : {
            'C': np.logspace(-4, 4, 10)
        }
    },
}

In [None]:
for _name, parameters in model_params.items():
    print(parameters)

In [None]:
scores = []

for _name, parameters in model_params.items():
    gscv = GridSearchCV(parameters['model'], parameters['params'], cv=kf, return_train_score=False)
    gscv.fit(X_train, y_train)

    scores.append({
        'model': _name,
        'best_score': gscv.best_score_,
        'best_params': gscv.best_params_,
        'best_estimator': gscv.best_estimator_})
    print(f"{_name}, Mean accuracy: {gscv.score(X_test,y_test)}\n") #evaluates on the best estimator

In [None]:
df_best_model = pd.DataFrame(scores)
df_best_model

`grid.best_score_` is the average of all cv folds for a single combination of the parameters specified

In [None]:
df_best_model.best_params[0] # want to see the whole in case something is missing

In [None]:
# Save to csv
df_best_model.to_csv("df_best_model_scores.csv")

In [None]:
df_best_model = pd.read_csv("df_best_model_scores.csv")
df_best_model

For KNN and RandomForest

In [None]:
model_params2 = {
    'KNN' : {
        'model' : KNeighborsClassifier(),
        'params' : {
            'n_neighbors': range(1,10)
        }
    },
    'Random Forest': {
        'model' : RandomForestClassifier(random_state=seed),
        'params' : {
            'criterion' :['gini', 'entropy']
        }
    },
}

In [None]:
for _name2, parameters2 in model_params2.items():
    print(parameters2)

In [None]:
scores2 = []

for _name2, parameters2 in model_params2.items():
    gscv2 = GridSearchCV(parameters2['model'], parameters2['params'], cv=kf, return_train_score=False)
    gscv2.fit(X_train, y_train)

    scores2.append({
        'model': _name2,
        'best_score': gscv2.best_score_,
        'best_params': gscv2.best_params_,
        'best_estimator': gscv2.best_estimator_})
    print(f"{_name2}, Mean accuracy: {gscv2.score(X_test,y_test)}\n") #evaluates on the best estimator

In [None]:
df_best_model2 = pd.DataFrame(scores2)
df_best_model2

In [None]:
# Save to csv
df_best_model2.to_csv("df_best_model_scores2.csv")

# Modelling

The first part is training and testing the first two best algorhithms selected: LinearSVC (because SVC was slow during hyperparameters) and Logistic Regression.

The next part has oversampling with SMOTE. Explanation will come later.

In [None]:
# Checking split

kfold1 = StratifiedKFold(n_splits=5,shuffle=True,random_state=seed)
splits = kfold1.split(encoded_df, y_data)
print(f'PROPORTION OF TARGET IN THE ORIGINAL DATA \n{y_data.value_counts() / len(y_data)}')
for n,(train_index1,test_index1) in enumerate(splits):
    #print(encoded_df)
    print('\n')
    print(f'Split #{n+1}')
    print(f'Training set size: {np.round(len(train_index1) / (len(train_index1)+len(test_index1)),2)}')
    print(f'Test set size: {np.round(len(test_index1) / (len(train_index1)+len(test_index1)),2)}')
    print(f'Proportion of Target in the Training SET:\n{encoded_df.iloc[test_index1,9].value_counts() / len(encoded_df.iloc[test_index1,9])}')
    print(f'Proportion of Target in the Test SET:\n{encoded_df.iloc[train_index1,9].value_counts() / len(encoded_df.iloc[train_index1,9])}')

All the functions needed:

In [None]:
def model_fitting(model_inputed, X, y, sk_f):

    y_true_together = []
    y_pred_together = []

    y_true_each = []
    y_pred_each = []

    for train_index, test_index in sk_f.split(X, y):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print(f'Proportion of Target in the Training SET:\n{y_data.value_counts() / len(y)}')
        print(f'Proportion of Target in the Test SET:\n{y_data.value_counts() / len(y)}')

        model_ = model_inputed
        model_.fit(X_train, y_train)

        y_pred_in = model_.predict(X_test)

        y_true_together.extend(y_test)
        y_pred_together.extend(y_pred_in)

        y_true_each.append(y_test)
        y_pred_each.append(y_pred_in)

    return y_true_each, y_pred_each, y_true_together, y_pred_together

In [None]:
# Calculate accuracy of each fold

def accuracy_of_folds(y_true, y_pred):
    all_accuracies = []

    for index in range(len(y_true)):
        accuracy = accuracy_score(y_true[index], y_pred[index])
        all_accuracies.append(accuracy)

    print(f"Accuracies of all {len(y_true)} folds:")
    return all_accuracies

In [None]:
def classification_report_of_each_fold(y_true, y_pred):
    for fold in range(len(y_true)):
        print(classification_report(y_true[fold], y_pred[fold]))

In [None]:
def ROC_of_folds(X, y, model):

    tprs, aucs = [], []
    mean_fpr = np.linspace(0, 1, 100)

    fig, ax = plt.subplots(figsize=(10, 8))
    for index, (train, test) in enumerate(skf.split(X, y)):
        model.fit(X[train], y[train])
        plot = RocCurveDisplay.from_estimator(
            model, X[test], y[test],
            name="ROC fold {}".format(index),
            ax=ax,
        )
        interp_tpr = np.interp(mean_fpr, plot.fpr, plot.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(plot.roc_auc)

    ax.set(
        xlim=[-0.05, 1.05],
        ylim=[-0.05, 1.05],
        title="Receiver operating characteristic with CV"
    )
    plt.plot([0,1],[0,1], linestyle="dashed", label="Chance level (AUC = 0.5)")
    plt.legend()
    plt.show()

Using Stratified CV because we have uneven amount of Yes and No values in our y.

In [None]:
skf = StratifiedKFold(n_splits=10,shuffle=True,random_state=seed) # stratified cross validation

# Logistic Regression

In [None]:
df_best_model_LR_LSVC = pd.read_csv("df_best_model_scores.csv")

In [None]:
print(df_best_model_LR_LSVC.head(1).best_estimator.values) # for log reg

In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
model_LR = LogisticRegression(C=10000.0, max_iter=1200, random_state=50, solver='liblinear') # best model with best hyperparameters

y_true_LR_each, y_pred_LR_each, y_true_LR_tog, y_pred_LR_tog = model_fitting(model_LR, x_data_np, y, skf)

In [None]:
accuracy_of_folds(y_true_LR_each, y_pred_LR_each)

In [None]:
accuracy_score(y_true_LR_tog, y_pred_LR_tog)*100

<u>Mean accuracy of all folds:</u> 89.57423725225583 %

In [None]:
confusion_matrix(y_true_LR_tog, y_pred_LR_tog)

In [None]:
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_true_LR_tog, y_pred_LR_tog))
disp.plot()

In [None]:
print(classification_report(y_true_LR_tog, y_pred_LR_tog))

In [None]:
classification_report_of_each_fold(y_true_LR_each, y_pred_LR_each)

In [None]:
ROC_of_folds(x_data_np, y, model_LR)

**Logistic Regression without cross validation:**

In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
model_LR.fit(X_train, y_train)
y_pred_LR_noCV = model_LR.predict(X_test)
print(f"Model accuracy: {(accuracy_score(y_test, y_pred_LR_noCV))*100} %")

<u>Model accuracy:</u> 89.2397248191921  %

In [None]:
confusion_matrix(y_pred_LR_noCV, y_test)

In [None]:
print(classification_report(y_test, y_pred_LR_noCV))

In [None]:
# https://www.sharpsightlabs.com/blog/scikit-learn-roc_curve/
test_fpr_LR, test_tpr_LR, thresholds = roc_curve(y_test, model_LR.predict_proba(X_test)[:,1])
train_fpr_LR, train_tpr_LR, thresholds = roc_curve(y_train, model_LR.predict_proba(X_train)[:,1])

In [None]:
plt.plot(train_fpr_LR, train_tpr_LR, label = f"AUC TRAIN = {round(auc(train_fpr_LR, train_tpr_LR), ndigits=2)}")
plt.plot(test_fpr_LR, test_tpr_LR, label = f"AUC TEST = {round(auc(test_fpr_LR, test_tpr_LR), ndigits=2)}")
plt.plot([1,0],[1,0], linestyle = "dashed", label="Chance level (AUC = 0.5)")
plt.title("ROC curve for Logistics Regression")
plt.xlabel("True positive rate")
plt.ylabel("False positive rate")
plt.xlim(0,1)   # set the xlim to left, right
plt.ylim(0,1)
plt.legend()
plt.show()

## LinearSVC

In [None]:
df_best_model_LR_LSVC.best_estimator[1]

In [None]:
model_L_SVC = LinearSVC(C=0.046415888336127774, dual=False, random_state=50)

y_true_L_SVC_each, y_pred_L_SVC_each, y_true_L_SVC_tog, y_pred_L_SVC_tog = model_fitting(model_L_SVC, x_data_np, y, skf) # using same stratified CV

In [None]:
accuracy_of_folds(y_true_L_SVC_each, y_pred_L_SVC_each)

In [None]:
accuracy_score(y_true_L_SVC_tog, y_pred_L_SVC_tog)*100

<u>Mean accuracy of all folds:</u> 89.48426874123469 %

In [None]:
confusion_matrix(y_true_L_SVC_tog, y_pred_L_SVC_tog)

In [None]:
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_true_L_SVC_tog, y_pred_L_SVC_tog))
disp.plot()

In [None]:
print(classification_report(y_true_L_SVC_tog, y_pred_L_SVC_tog))

In [None]:
classification_report_of_each_fold(y_true_L_SVC_each, y_pred_L_SVC_each)

In [None]:
ROC_of_folds(x_data_np, y, model_L_SVC)

**LinearSVC without cross validation:**

In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
model_L_SVC.fit(X_train, y_train)
y_pred_L_SVC_noCV = model_L_SVC.predict(X_test)
print(f"Model accuracy: {(accuracy_score(y_test, y_pred_L_SVC_noCV))*100} %")

<u>Accuracy of all folds:</u> 89.18680543305697  %

In [None]:
confusion_matrix(y_pred_L_SVC_noCV, y_test)

In [None]:
print(classification_report(y_test, y_pred_L_SVC_noCV))

In [None]:
x_train_pred_L_SVC = model_L_SVC.decision_function(X_train)
x_test_pred_L_SVC = model_L_SVC.decision_function(X_test)

train_fpr_L_SVC, train_tpr_L_SVC, tr_thresholds_SVC = roc_curve(y_train, x_train_pred_L_SVC)
test_fpr_L_SVC, test_tpr_L_SVC, te_thresholds_SVC = roc_curve(y_test, x_test_pred_L_SVC)

In [None]:
plt.plot(train_fpr_L_SVC, train_tpr_L_SVC, label = f"AUC TRAIN = {round(auc(train_fpr_L_SVC, train_tpr_L_SVC), ndigits=2)}")
plt.plot(test_fpr_L_SVC, test_tpr_L_SVC, label = f"AUC TEST = {round(auc(test_fpr_L_SVC, test_tpr_L_SVC), ndigits=2)}")
plt.plot([1,0],[1,0], linestyle = "dashed", label="Chance level (AUC = 0.5)")
plt.xlabel("True positive rate")
plt.ylabel("False positive rate")
plt.title("ROC curve for LinearSVC")
plt.xlim(0,1)   # set the xlim to left, right
plt.ylim(0,1)
plt.legend()
plt.show()

## SVC based on default parameters



In [None]:
svc_model = SVC()

svc_model.fit(X_train, y_train)

y_pred_SVC = svc_model.predict(X_test)
print(f"Model accuracy: {(accuracy_score(y_test, y_pred_SVC))*100} %")

In [None]:
confusion_matrix(y_pred_SVC, y_test)

In [None]:
print(classification_report(y_test, y_pred_SVC))

In [None]:
x_train_pred_SVC = svc_model.decision_function(X_train)
x_test_pred_SVC = svc_model.decision_function(X_test)

train_fpr_SVC, train_tpr_SVC, tr_thresholdsSVC = roc_curve(y_train, x_train_pred_SVC)
test_fpr_SVC, test_tpr_SVC, te_thresholdsSVC = roc_curve(y_test, x_test_pred_SVC)

In [None]:
plt.plot(train_fpr_SVC, train_tpr_SVC, label = f"AUC TRAIN = {round(auc(train_fpr_SVC, train_tpr_SVC), ndigits=2)}")
plt.plot(test_fpr_SVC, test_tpr_SVC, label = f"AUC TEST = {round(auc(test_fpr_SVC, test_tpr_SVC), ndigits=2)}")
plt.plot([1,0],[1,0], linestyle = "dashed", label="Chance level (AUC = 0.5)")
plt.xlabel("True positive rate")
plt.ylabel("False positive rate")
plt.title("ROC curve for SVC")
plt.xlim(0,1)   # set the xlim to left, right
plt.ylim(0,1)
plt.legend()
plt.show()

Both Logistics Regression and LinearSVC demonstrate a challenge in accurately predicting clients who will subscribe (Yes (1)). Same goes for SVC.  

With `class_weight= {1:0.75, 0:0.25}` Precision and Recall are around 50 %.

Without weighted `class_weight= {1:0.75, 0:0.25}` values for Yes (1) Precision dominates compared to Recall: Precision is about 60 % while Recall 20 %, meanining 60 % of customers that we correctly identify as subscribers out of all the customers actually subscribed. And for all the customers who actually subscribed, recall tells us how many we correctly identified as being subscribers.

# Oversampling with SMOTE

Since LinearSVC and Logistic Regression did not do very well when identifying the Yes (1) class. I thought I should try with SMOTE oversampling after trying to figure out and researching on how to tackle unbalanced data. For example, [this article](https://machinelearningmastery.com/tactics-to-combat-imbalanced-classes-in-your-machine-learning-dataset/) helped.

Also, since I have tested the other algorhithms on the data without test/train split (during the selection of best algorithm), I thought I should try K Nearest Neighbor Classifier and Random Forest Classifier out additionally, because both of them and the other models above had quite similar accuracy results. Well except Decision Tree Classifier.

In [None]:
# What is the imbalance?
y.mean()

In [None]:
df_best_model_KNN_RF = pd.read_csv("df_best_model_scores2.csv")
df_best_model_KNN_RF

In [None]:
from imblearn.over_sampling import SMOTE

oversample = SMOTE()

In [None]:
# All models

model_LR = LogisticRegression(C=10000.0, max_iter=1200, random_state=seed, solver='liblinear', class_weight="balanced") # best model with best hyperparameters

model_L_SVC = LinearSVC(C=0.046415888336127774, dual=False, random_state=seed, class_weight="balanced")

svc_model = SVC()

model_KNNC = KNeighborsClassifier(n_neighbors=8)

model_RF = RandomForestClassifier(random_state=seed, criterion='entropy')

We should oversample only our training set, or use only oversampled train data set. Because when testing we want to test our model on unseen data, and that data can either be balanced or unbalanced in real life.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x_data_np, y, test_size=0.3, stratify=y, random_state=seed) # split the original data into train and test
over_X_train, over_y_train = oversample.fit_resample(X_train, y_train)

over_y_train.mean()

In [None]:
print(over_X_train.shape)
print(over_y_train.shape)

print()

print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

In [None]:
np.unique(over_y_train, return_counts=True)

In [None]:
np.unique(y_test, return_counts=True)

In [None]:
len(over_X_train)

In [None]:
x_data_np.shape

**Logistic Regression**

In [None]:
model_LR.fit(over_X_train, over_y_train)
y_pred_LR = model_LR.predict(X_test)
print(classification_report(y_test, y_pred_LR))

**Linear SVC**

In [None]:
model_L_SVC.fit(over_X_train, over_y_train)
y_pred_L_SVC = model_L_SVC.predict(X_test)
print(classification_report(y_test, y_pred_L_SVC))

**SVC**

In [None]:
svc_model.fit(over_X_train, over_y_train)
y_pred_SVC = svc_model.predict(X_test)
print(classification_report(y_test, y_pred_SVC))

The above 3 are giving the almost the same results as previously. Except now Recall is better than Precision, as well as F-1 score is higher than before oversampling
  
I will test KNN Classifier and Random Forest next:

**K Nearest Neighbor CLassifier**  

Before oversampling:

In [None]:
# When K=8
model_KNNC.fit(X_train, y_train)
y_pred_KNNC = model_KNNC.predict(X_test)
print(classification_report(y_test, y_pred_KNNC))

After oversampling:

In [None]:
skf_ov = StratifiedKFold(n_splits=10,shuffle=True,random_state=seed) # stratified cross validation

In [None]:
k_s = range(1,20)
for k in k_s:

    cross_val_f1_score_all = []
    cross_val_accuracy_all = []
    cross_val_recall_all = []
    cross_val_precision_all = []

    for train_index_ls, validation_index_ls in skf_ov.split(X_train, y_train):
        X_train_, X_validation = X_train[train_index_ls], X_train[validation_index_ls]
        target_train, target_val = y_train[train_index_ls], y_train[validation_index_ls]

        # Split only the train set
        X_train_ov, y_train_ov = oversample.fit_resample(X_train_, target_train)
        print(X_train_ov.shape, y_train_ov.shape)

        model_KNNC_cv = KNeighborsClassifier(n_neighbors=k)
        model_KNNC_cv.fit(X_train_ov, y_train_ov)
        validation_preds = model_KNNC_cv.predict(X_validation)
        cross_val_recall_all.append(recall_score(target_val, validation_preds))
        cross_val_accuracy_all.append(accuracy_score(target_val, validation_preds))
        cross_val_precision_all.append(precision_score(target_val, validation_preds))
        cross_val_f1_score_all.append(f1_score(target_val, validation_preds))

    print(f'K={k}')
    print (f'Cross validated accuracy: {np.mean(cross_val_accuracy_all)}')
    print (f'Cross validated recall score: {np.mean(cross_val_recall_all)}')
    print (f'Cross validated precision score: {np.mean(cross_val_precision_all)}')
    print (f'Cross validated f1_score: {np.mean(cross_val_f1_score_all)}\n')

Based on accuracy K=2 is the best

In [None]:
# With K = 2
model_KNNC_4 = KNeighborsClassifier(n_neighbors=2)
model_KNNC_4.fit(over_X_train, over_y_train)
y_pred_KNNC_4 = model_KNNC_4.predict(X_test)
print(classification_report(y_test, y_pred_KNNC_4))

After oversampling both Recall and F-1 score increased. But of course Precision went down. KNN is not the best here.

**Random Forest**

Before using oversample:

In [None]:
model_RF.fit(X_train, y_train)
y_pred_RF = model_RF.predict(X_test)
print(classification_report(y_test, y_pred_RF))

After using oversample:

In [None]:
cross_val_f1_score_lst = []
cross_val_accuracy_lst = []
cross_val_recall_lst = []
cross_val_precision_lst = []

for train_index_ls, validation_index_ls in skf_ov.split(X_train, y_train):
    # keeping validation set apart and oversampling in each iteration using smote
    train, validation = X_train[train_index_ls], X_train[validation_index_ls]
    target_train, target_val = y_train[train_index_ls], y_train[validation_index_ls]

    X_train_res, y_train_res = oversample.fit_resample(train, target_train)
    print (X_train_res.shape, y_train_res.shape)

    # training the model on oversampled 4 folds of training set

    model_RF.fit(X_train_res, y_train_res)
    # testing on 1 fold of validation set
    validation_preds = model_RF.predict(validation)
    cross_val_recall_lst.append(recall_score(target_val, validation_preds))
    cross_val_accuracy_lst.append(accuracy_score(target_val, validation_preds))
    cross_val_precision_lst.append(precision_score(target_val, validation_preds))
    cross_val_f1_score_lst.append(f1_score(target_val, validation_preds))
print ('Cross validated accuracy: {}'.format(np.mean(cross_val_accuracy_lst)))
print ('Cross validated recall score: {}'.format(np.mean(cross_val_recall_lst)))
print ('Cross validated precision score: {}'.format(np.mean(cross_val_precision_lst)))
print ('Cross validated f1_score: {}'.format(np.mean(cross_val_f1_score_lst)))

In [None]:
model_RF.fit(over_X_train, over_y_train)
y_pred_RF = model_RF.predict(X_test)
print(classification_report(y_test, y_pred_RF))

F-1 score increased when using oversampling for Random forest.

At the end I chose quite a few classification models because they are more well know like Logistics Regression, Support Vector Machine, KNN Classifier and Random Forest. I did manage to reach the accuracy above 80 % on the first round, however because of data inbalance Precision and Recall suffered for the Yes class (1). I tried to overcome it with different tipe of data filtering and feature engineering, however it gave either similar results or accuracy became lower than 70 %. I believe getting more data points would be beneficial.  

Scientific bottlenecks were the unbalanced low precision and recall on class Yes. I tried to overcome it by doing oversampling with SMOTE. I found that oversampling is done more often than undersampling. And I think the best performing model was Random Forest after SMOTE oversampling.