# Scenario

A telecom company wants to predict customer churn. We will compare the accuracy of this model with that of other ML algorithms to determine which one is the best fit for the sample data.

In [1]:
# Import the necessary packages
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

import warnings
warnings.filterwarnings('ignore')

# Location of the customer data set (edit this one constant if the CSV lives elsewhere)
DATA_PATH = r'C:\Users\patri\Desktop\LSE_CareerAccelerator\Course 3\Week 2\Data\customer_data.csv'

# Load the CSV into a pandas DataFrame
data = pd.read_csv(DATA_PATH)

# Sense check the data: column names, non-null counts and dtypes
data.info()

  from pandas import Int64Index as NumericIndex


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Age            41188 non-null  int64  
 1   Occupation     41188 non-null  object 
 2   Status         41188 non-null  object 
 3   Edu            41188 non-null  object 
 4   House          41188 non-null  object 
 5   Loan           41188 non-null  object 
 6   Comm           41188 non-null  object 
 7   Month          41188 non-null  object 
 8   DOW            41188 non-null  object 
 9   Duration       41188 non-null  int64  
 10  Campaign       41188 non-null  int64  
 11  PosDays        41188 non-null  int64  
 12  Last_out       41188 non-null  object 
 13  Var_rate       41188 non-null  float64
 14  Price_idx      41188 non-null  float64
 15  Conf_idx       41188 non-null  float64
 16  Month_rate     41188 non-null  float64
 17  Quarterly_emp  41188 non-null  float64
 18  Target

In [2]:
# Collapse the raw 'Edu' labels into a smaller set of categories.
# Every row whose 'Edu' value contains one of the initial substrings is
# overwritten with the matching new label.

# Initial substrings to look for, and the labels that replace them (paired by position).
intial_vals = ['illiterate', 'unknown', 'basic', 'high', 'university', 'professional']
new_vals = ['other', 'other', 'pre-school', 'high-school', 'uni', 'masters']
edu_relabel = dict(zip(intial_vals, new_vals))

# Apply the replacements one at a time, in the order listed above.
for fragment, label in edu_relabel.items():
    matches = data['Edu'].str.contains(fragment)
    data.loc[matches, 'Edu'] = label

# Display all the unique values/check changes.
data['Edu'].unique()

array(['pre-school', 'other', 'uni', 'high-school', 'masters'],
      dtype=object)

In [3]:
# Categorical columns that are transformed into dummy (indicator) variables.
cat_vars = ['Occupation', 'Status', 'Edu', 'House', 'Loan', 'Comm', 'Month', 'DOW', 'Last_out']

# One-hot encode all categorical columns in a single call. Each column listed in
# cat_vars is replaced by indicator columns named '<column>_<value>', appended
# after the untouched columns. `data` itself is left unmodified, so this cell
# can be re-run without failing on already-existing dummy columns.
df_fin = pd.get_dummies(data, columns = cat_vars)

# Sanity check: none of the original categorical columns should remain.
assert not set(cat_vars) & set(df_fin.columns), 'categorical columns were not encoded'

# Summary statistics of the encoded DataFrame
df_fin.describe()

Unnamed: 0,Age,Duration,Campaign,PosDays,Var_rate,Price_idx,Conf_idx,Month_rate,Quarterly_emp,Target,...,Month_oct,Month_sep,DOW_fri,DOW_mon,DOW_thu,DOW_tue,DOW_wed,Last_out_failure,Last_out_nonexistent,Last_out_success
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,...,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,40.02406,258.28501,2.567593,962.475454,0.081886,93.575664,-40.5026,3.621291,5167.035911,0.112654,...,0.017432,0.013839,0.190031,0.206711,0.209357,0.196416,0.197485,0.103234,0.863431,0.033335
std,10.42125,259.279249,2.770014,186.910907,1.57096,0.57884,4.628198,1.734447,72.251528,0.316173,...,0.130877,0.116824,0.39233,0.404951,0.406855,0.397292,0.398106,0.304268,0.343396,0.179512
min,17.0,0.0,1.0,0.0,-3.4,92.201,-50.8,0.634,4963.6,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,32.0,102.0,1.0,999.0,-1.8,93.075,-42.7,1.344,5099.1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,38.0,180.0,2.0,999.0,1.1,93.749,-41.8,4.857,5191.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,47.0,319.0,3.0,999.0,1.4,93.994,-36.4,4.961,5228.1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,98.0,4918.0,56.0,999.0,1.4,94.767,-26.9,5.045,5228.1,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Balance the data

We need to apply the SMOTE process to balance the data. We first need to identify the column that will determine whether the dataset is balanced. 

In [4]:
# Replace missing values with zero (leaves the frame unchanged if nothing is missing)
df_fin = df_fin.fillna(0)

# Select the necessary columns (dummy variables used as model features):
nec_cols = ['Status_divorced', 'Status_married', 'Status_single', 'Status_unknown',
            'Edu_high-school', 'Edu_masters', 'Edu_other', 'Edu_pre-school', 'Edu_uni',
            'House_no', 'House_unknown', 'House_yes',
            'Loan_no', 'Loan_unknown', 'Loan_yes',
            'DOW_fri', 'DOW_mon', 'DOW_thu', 'DOW_tue', 'DOW_wed']

X = df_fin[nec_cols]
y = df_fin['Target']

# Hold out 30% of the rows for testing. The split is done before oversampling,
# so only the training portion is resampled and the test set is left as-is.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Target is not balanced, so apply the SMOTE technique to the training set.
# The sampler is named `smote` (not `os`) so that it does not shadow the name
# of Python's standard-library `os` module.
smote = SMOTE(random_state = 0)

# Specify the column values
columns = X_train.columns
# Specify the new (resampled) data sets
os_data_X, os_data_y = smote.fit_resample(X_train, y_train)

# Create two DataFrames: one for X and one for y
os_data_X = pd.DataFrame(data = os_data_X, columns = columns)
os_data_y = pd.DataFrame(data = os_data_y, columns = ['Target'])

# Print the number of rows in the oversampled training set
print('length of oversampled data is ', len(os_data_X))

os_data_y

length of oversampled data is  51134


Unnamed: 0,Target
0,0
1,0
2,0
3,0
4,0
...,...
51129,1
51130,1
51131,1
51132,1


In [5]:
# Count the rows per Target value in the oversampled training labels
# to confirm that the two classes are now the same size.
class_counts = os_data_y.value_counts()
class_counts

Target
0         25567
1         25567
dtype: int64

### Build and apply the SVM model

In [6]:
# Import the svm package from the sklearn library
from sklearn import svm
from sklearn.metrics import confusion_matrix

# Create an SVM classifier using a linear kernel
clf = svm.SVC(kernel = 'linear', gamma = 'scale')

# Train the model using the oversampled training set.
# os_data_y is a single-column DataFrame, while scikit-learn expects a 1-D
# target of shape (n_samples,). Flatten it explicitly with ravel() instead of
# relying on an implicit conversion.
clf.fit(os_data_X, os_data_y.values.ravel())

# Predict the response for the (untouched) test data set.
y_pred = clf.predict(X_test)

### Determine the accuracy of the model

Now that the model has been built and fitted to the data set, we can prepare the confusion matrix and accuracy report to determine how the SVM and BLR models compare.

In [7]:
# Import the scikit-learn metrics module
from sklearn.metrics import confusion_matrix
from sklearn import metrics

# Confusion matrix of true test labels against predicted labels
print(confusion_matrix(y_test, y_pred))

# Summary scores, printed in this order:
#   Accuracy  - how often the classifier is correct overall
#   Precision - what share of the samples predicted positive are actually positive
#   Recall    - what share of the actual positive samples are predicted positive
score_functions = (
    ('Accuracy:', metrics.accuracy_score),
    ('Precision:', metrics.precision_score),
    ('Recall:', metrics.recall_score),
)
for label, score_fn in score_functions:
    print(label, score_fn(y_test, y_pred))

[[7360 3621]
 [ 789  587]]
Accuracy: 0.6431172614712308
Precision: 0.13949619771863117
Recall: 0.4265988372093023
