In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
import seaborn as sns

In [None]:
df = pd.read_csv("customer_data.csv")

# Some columns contain blank strings (a single space) where a 0 value is intended,
# so swap those placeholders for 0, one column at a time.
for column_name in df.columns:
    blank_rows = df[column_name] == ' '
    df.loc[blank_rows, column_name] = 0


# Cleaning data, converting categorical to one-hot encoded columns

In [None]:

# Reduce each of these columns to a 0/1 flag marking one specific category
for column_name, flagged_value in (('gender', "Male"),
                                   ('InternetService', "DSL"),
                                   ('Contract', "Month-to-month")):
    df[column_name] = df[column_name].eq(flagged_value).astype(int)
df['TotalCharges'] = df['TotalCharges'].astype(float)


# One-hot encode the columns that have more than two possible categorical values
multiple_lines_encoded = pd.get_dummies(df['MultipleLines'], prefix='MultipleLines')
payment_method_encoded = pd.get_dummies(df['PaymentMethod'], prefix='PaymentMethod')
df = pd.concat([df, multiple_lines_encoded, payment_method_encoded], axis=1)

# The identifier and the original (now encoded) columns are no longer needed
df = df.drop(columns=['customerID', 'MultipleLines', 'PaymentMethod'])



In [None]:
# Columns holding 'Yes'/'No' strings
yes_to_int = ['Partner', 'Dependents', 'PhoneService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'Churn']

# Dummy columns holding True/False
t_f_to_int = ['MultipleLines_No', 'MultipleLines_No phone service', 'MultipleLines_Yes', 'PaymentMethod_Bank transfer (automatic)', 'PaymentMethod_Credit card (automatic)',
              'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check']

# Convert both groups to binary: 1 where the cell equals the "positive" value, 0 otherwise
for column_group, positive_value in ((yes_to_int, 'Yes'), (t_f_to_int, True)):
    for label in column_group:
        df[label] = df[label].eq(positive_value).astype(int)


# Visualizing features

In [None]:
#for label in df.drop('Churn', axis=1).columns:
#  plt.hist(df[df["Churn"] == 1][label], color="blue", label = 'Churned', alpha = 0.7, density=True)
#  plt.hist(df[df["Churn"] == 0][label], color="red", label = 'Not churned', alpha = 0.7, density=True)
#  plt.title(label)
#  plt.ylabel('Probability')
#  plt.xlabel(label)
#  plt.legend()
#  plt.show()

In [None]:
# Based on the feature visualizations, drop the columns that don't appear
# to have a strong association with churn
df = df.drop(columns=[
    'gender',
    'PhoneService',
    'MultipleLines_No',
    'MultipleLines_No phone service',
])

# Train, validation, test datasets

In [None]:
from sklearn.metrics import classification_report

In [None]:
# We need to scale the dataset but only for features that are not binary
# In our case that's tenure, MonthlyCharges, and TotalCharges
# In the scaling function below, we include a case where we can pass in columns to ignore

def selectively_scale_dataset(dataset, binary_cols, scaler=None):
    """Standardize the non-binary columns of ``dataset`` and leave the rest untouched.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding both the columns to scale and the columns to pass through.
    binary_cols : list of str
        Columns that must NOT be scaled. Every other column of ``dataset`` is scaled.
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler to reuse (e.g. one fitted on the training split, so a
        test split is transformed with the training statistics). When omitted, a new
        ``StandardScaler`` is fitted on ``dataset`` itself, which is the original behavior.

    Returns
    -------
    pandas.DataFrame
        The scaled columns first (in their original relative order) followed by
        ``binary_cols``, sharing the index of ``dataset``.
    """
    # Identify numerical columns that are not binary
    numerical_cols = [col for col in dataset.columns if col not in binary_cols]

    # Nothing to scale: a scaler cannot be fitted on zero features, so just pass
    # the binary columns through
    if not numerical_cols:
        return dataset[binary_cols].copy()

    # Scale numerical columns, fitting a fresh scaler only when none was supplied
    if scaler is None:
        scaled_numerical_data = StandardScaler().fit_transform(dataset[numerical_cols])
    else:
        scaled_numerical_data = scaler.transform(dataset[numerical_cols])

    # The original index must be preserved, otherwise pd.concat below would
    # misalign the scaled rows with the binary columns
    scaled_numerical_df = pd.DataFrame(
        scaled_numerical_data, columns=numerical_cols, index=dataset.index
    )

    # Combine scaled numerical data with binary columns
    return pd.concat([scaled_numerical_df, dataset[binary_cols]], axis=1)

In [None]:
# Isolate features and target columns: every column that is not a target is a feature
TARGET = ['Churn']
FEATURES = [column_name for column_name in df.columns if column_name not in TARGET]

In [None]:
# Since most of our features have binary data we don't want to scale those features.
# Everything outside the continuous columns listed here is treated as binary.
non_binaries = {'MonthlyCharges', 'TotalCharges', 'tenure'}
binary_features = [column_name for column_name in df.columns if column_name not in non_binaries]

In [None]:
def get_X_y(df, FEATURES, TARGET):
    """Return ``df`` together with its feature and target sub-frames.

    ``FEATURES`` and ``TARGET`` are column selections applied directly to ``df``;
    the result is the tuple ``(df, df[FEATURES], df[TARGET])``.
    """
    features_frame, target_frame = df[FEATURES], df[TARGET]
    return df, features_frame, target_frame

In [None]:
# We will be doing cross validation, so separate the features from the target first.
# Scaling must not happen before the data is split.
from sklearn.model_selection import train_test_split
df, X, y = get_X_y(df, FEATURES, TARGET)

In [None]:
# Hold out 40% of the rows as the test set; the remaining 60% is the training set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.4)


In [None]:
# Need to remove churn from binary features as it's not in X
X_binaries = [col for col in binary_features if col != 'Churn']
X_numericals = [col for col in X_train.columns if col not in X_binaries]

# Fit the scaler on the training split ONLY and reuse it for the test split.
# Fitting a second scaler on the test split would transform the two splits with
# different means/variances and leak test-set statistics into preprocessing.
feature_scaler = StandardScaler().fit(X_train[X_numericals])


def _scale_split(split):
    """Scale the numerical columns of one split with the train-fitted scaler."""
    scaled_numericals = pd.DataFrame(
        feature_scaler.transform(split[X_numericals]),
        columns=X_numericals,
        index=split.index,  # keep the index so concat aligns rows correctly
    )
    # Same column layout as before: scaled numerical columns first, then the binary ones
    return pd.concat([scaled_numericals, split[X_binaries]], axis=1)


X_train = _scale_split(X_train)
X_test = _scale_split(X_test)
# Don't need to scale y as it's binary

In [None]:
# Convert the frames to plain NumPy arrays; the targets are flattened to 1-D
# In future could put this inside the scaling function...
X_train, X_test = X_train.to_numpy(), X_test.to_numpy()
y_train = y_train.to_numpy().ravel()
y_test = y_test.to_numpy().ravel()
y = y.to_numpy().ravel()

# SVM

In [None]:
from sklearn import svm

In [None]:
# RBF-kernel SVM trained on the scaled training split (fit returns the estimator itself)
svm_model = svm.SVC(C=30, kernel='rbf', gamma='auto').fit(X_train, y_train)

In [None]:
# Score the fitted model on the held-out test split.
# Recall that class '1' is the churned case.
# The model looks decent here, but this is a single split - no cross-validation yet!
y_pred = svm_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.83      0.89      0.86      2071
           1       0.63      0.50      0.56       747

    accuracy                           0.79      2818
   macro avg       0.73      0.70      0.71      2818
weighted avg       0.78      0.79      0.78      2818



In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# X still holds the raw, unscaled features here, while the SVM was set up for scaled data.
# Scaling therefore happens inside a pipeline, so each CV fold fits the scaler on its
# own training part only and applies it to its validation part.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

cv_preprocessor = ColumnTransformer(
    [('scale', StandardScaler(), sorted(non_binaries))],  # only the continuous columns
    remainder='passthrough',                              # binary columns are left as-is
)
# cross_val_score clones the pipeline for every fold, so the already fitted
# svm_model is not modified; only its hyperparameters are reused.
cv_model = make_pipeline(cv_preprocessor, svm_model)

scores = cross_val_score(cv_model, X, y, cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.76 accuracy with a standard deviation of 0.00


In [None]:
# Evaluating performance of multiple algorithms and parameters

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Candidate models and the hyperparameter grid to search for each of them
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1],
            'kernel': ['rbf', 'linear']
        }
    },
    'random_forest': {
        # Fixed seed so repeated runs build the same forest and give the same scores
        'model': RandomForestClassifier(random_state=0),
        'params': {
            'n_estimators': [1]
        }
    },
    'logistic_regression': {
        # multi_class='auto' is omitted: it matches the default behavior, and the
        # argument itself is deprecated in recent scikit-learn releases
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1]
        }
    }
}

In [None]:
# X holds the raw, unscaled features, so every candidate model is wrapped in a pipeline
# that standardizes the continuous columns inside each CV fold before fitting.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

scores = []

for model_name, mp in model_params.items():
    search_pipeline = Pipeline([
        ('scale', ColumnTransformer(
            [('num', StandardScaler(), sorted(non_binaries))],  # continuous columns only
            remainder='passthrough',                            # binary columns untouched
        )),
        ('model', mp['model']),
    ])
    # Pipeline parameters are addressed as "<step>__<param>"
    param_grid = {f'model__{name}': values for name, values in mp['params'].items()}

    clf = GridSearchCV(search_pipeline, param_grid, cv=2, return_train_score=False)
    clf.fit(X, y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        # Strip the "model__" step prefix so the reported names match model_params
        'best_params': {name.split('__', 1)[1]: value
                        for name, value in clf.best_params_.items()}
    })