In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [None]:
# Load the raw customer table and discard the customerID column right away:
# it is only an identifier and tells us nothing useful for modelling.
df = pd.read_csv('customer_data.csv').drop(columns=['customerID'])

# Quick look at the first rows
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
# Convert churn into a binary column (1 = "Yes", 0 = anything else).
# Guarded on the dtype so that re-running this cell does not compare the
# already-encoded integers against "Yes" and silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = (df['Churn'] == "Yes").astype(int)

# Parse TotalCharges as floats; entries that cannot be parsed become NaN
# (errors='coerce') -- presumably the column holds some blank strings.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').astype(float)

# Drop every row with a missing value in ANY column; this includes the rows
# whose TotalCharges could not be parsed above.
df = df.dropna()

In [None]:
# One-hot encode the categorical columns into 0/1 integer indicator columns
# (one column per category value); numeric columns are kept as they are.
df_dummies = pd.get_dummies(df, dtype = int)
df_dummies.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,1,0,0,1,1,...,0,1,0,0,0,1,0,0,1,0
1,0,34,56.95,1889.5,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,1
2,0,2,53.85,108.15,1,0,1,1,0,1,...,0,1,0,0,0,1,0,0,0,1
3,0,45,42.3,1840.75,0,0,1,1,0,1,...,0,0,1,0,1,0,1,0,0,0
4,0,2,70.7,151.65,1,1,0,1,0,1,...,0,1,0,0,0,1,0,0,1,0


In [None]:
# Target: the binary Churn flag as a NumPy array
y = df_dummies['Churn'].values
# Features: every remaining column, kept as a DataFrame
X = df_dummies.drop(columns = ['Churn'])

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Train/test split and scaling of the numerical columns

In [None]:
# Split the data: 40% is held out as the test set, the remaining 60% is the training set.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.4, random_state = 0)

In [None]:
def scale_train(train_df, scaler=None):
  """Fit a scaler on the training columns and return them scaled.

  Args:
    train_df: DataFrame containing only the columns that should be scaled.
    scaler: optional unfitted transformer exposing ``fit_transform``.
      When omitted, a fresh ``StandardScaler`` is created, which is the
      original behaviour of this function.

  Returns:
    A ``(scaled_df, scaler)`` tuple. ``scaled_df`` has the same index and
    column labels as ``train_df``; ``scaler`` is the fitted transformer so
    it can later be applied to other data without re-fitting.
  """
  if scaler is None:
    scaler = StandardScaler()
  scaled_values = scaler.fit_transform(train_df)
  # fit_transform's result is wrapped back into a DataFrame here so the
  # original row/column labels are restored on the returned frame.
  scaled_df = pd.DataFrame(scaled_values, columns=train_df.columns, index=train_df.index)
  return scaled_df, scaler

In [None]:
# Send only the numerical columns for scaling. The returned scaler is kept so the
# test data can later be transformed (not fit_transform-ed) with the same parameters.
# NOTE: re-running this cell on its own would fit a new scaler on the already
# scaled columns; re-run from the train/test split cell instead.
numerical_cols, scaler = scale_train(X_train[['tenure', 'MonthlyCharges', 'TotalCharges']])

# Replace the unscaled numerical columns with the scaled ones. drop() returns a
# new frame rather than mutating the frame produced by train_test_split in place
# (pandas can emit SettingWithCopyWarning for such in-place edits); the scaled
# columns end up at the end of the frame, exactly as before.
X_train = pd.concat(
    [X_train.drop(columns=numerical_cols.columns), numerical_cols], axis=1)

In [None]:
# Preview the training features; the scaled numerical columns were re-attached last
X_train.head()

Unnamed: 0,SeniorCitizen,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,...,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure,MonthlyCharges,TotalCharges
4043,1,0,1,0,1,1,0,1,0,0,...,1,0,1,0,1,0,0,1.087342,-0.021099,0.703168
471,0,0,1,1,0,1,0,0,1,1,...,0,0,1,0,0,1,0,-1.274387,-0.321895,-0.979707
4467,0,1,0,0,1,1,0,0,1,0,...,0,0,1,0,0,0,1,0.639428,0.161705,0.504988
6047,0,0,1,0,1,1,0,0,1,0,...,0,0,1,0,0,1,0,-0.785753,0.038728,-0.607611
3169,0,0,1,0,1,0,1,0,1,1,...,0,0,1,0,0,1,0,0.802305,0.135115,0.525739


In [None]:
def scale_test(test_df, scaler):
  """Scale ``test_df`` with an already-fitted scaler.

  Only ``scaler.transform`` is called, so nothing is fitted here.
  The result is returned as a DataFrame carrying the same index and
  column labels as ``test_df``.
  """
  scaled_values = scaler.transform(test_df)
  return pd.DataFrame(scaled_values, index=test_df.index, columns=test_df.columns)

In [None]:
# Send only the numerical columns for scaling, together with the scaler that was
# fitted on the training data, so the test rows are transformed but never fitted on.
# NOTE: re-running this cell on its own would transform the already scaled
# columns a second time; re-run from the train/test split cell instead.
numerical_cols = scale_test(X_test[['tenure', 'MonthlyCharges', 'TotalCharges']], scaler)

# Replace the unscaled numerical columns with the scaled ones. drop() returns a
# new frame rather than mutating the frame produced by train_test_split in place
# (pandas can emit SettingWithCopyWarning for such in-place edits); the scaled
# columns end up at the end of the frame, exactly as before.
X_test = pd.concat(
    [X_test.drop(columns=numerical_cols.columns), numerical_cols], axis=1)

In [None]:
# Preview the test features; the scaled numerical columns were re-attached last
X_test.head()

Unnamed: 0,SeniorCitizen,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,...,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure,MonthlyCharges,TotalCharges
5561,0,0,1,1,0,1,0,0,1,1,...,0,1,0,0,0,1,0,-1.274387,-1.46359,-0.994793
5814,0,0,1,1,0,1,0,0,1,1,...,0,0,1,0,1,0,0,-0.663595,-1.465252,-0.858822
2645,0,1,0,1,0,1,0,0,1,1,...,0,1,0,0,1,0,0,-1.274387,-1.491841,-0.995166
3983,0,0,1,1,0,1,0,1,0,0,...,0,0,1,0,0,1,0,-1.274387,-1.343936,-0.993211
6438,1,0,1,1,0,1,0,0,1,0,...,0,0,1,0,0,1,0,-1.274387,0.322905,-0.971187


In [None]:
# Convert the feature frames to plain NumPy arrays (the shape is unchanged).
# np.asarray also accepts an ndarray, so running this cell a second time is
# harmless, whereas `.values` would raise AttributeError on the second run.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)

# SVM

In [None]:
from sklearn import svm
from sklearn.metrics import classification_report

In [None]:
# Linear-kernel support vector classifier with penalty parameter C=30.
# gamma is deliberately not passed: it is the coefficient of the rbf/poly/sigmoid
# kernels only and has no effect when kernel='linear'.
# fit() returns the estimator itself, so construction and fitting can be chained.
svm_model = svm.SVC(kernel='linear', C=30).fit(X_train, y_train)

In [None]:
# Predict on the held-out test split and print per-class precision/recall/F1
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred))
# The model appears to perform decently, but this is a single train/test split -- we haven't done cross-validation yet!
# Recall that class '1' is the churned customers (Churn == "Yes")

              precision    recall  f1-score   support

           0       0.85      0.90      0.87      2061
           1       0.66      0.56      0.61       752

    accuracy                           0.81      2813
   macro avg       0.76      0.73      0.74      2813
weighted avg       0.80      0.81      0.80      2813



In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Cross-validate with the scaling done inside every fold. Handing the raw X
# directly to the SVM would score it on unscaled tenure/charges columns, which
# is not the setup the model above was fitted with. Wrapping the scaler and
# the SVM in one pipeline makes each fold fit its scaler on that fold's
# training part only and apply it to the fold's validation part.
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
scale_numeric = ColumnTransformer(
    [('numeric', StandardScaler(), numeric_features)],
    remainder='passthrough',  # the 0/1 dummy columns are passed through unscaled
)
cv_pipeline = Pipeline([('scale', scale_numeric), ('svm', svm_model)])

# cross_val_score works on clones of the pipeline, so the already fitted
# svm_model is only used as a template for its hyper-parameters.
scores = cross_val_score(cv_pipeline, X, y, cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.79 accuracy with a standard deviation of 0.01


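# The problem to watch for here is that `cross_val_score` on its own will not scale our numerical columns separately from the categorical (dummy) columns within each fold; that scaling has to be built into the estimator being cross-validated. See customer churn V3!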
