In [7]:
import pandas as pd
import numpy as np

In [8]:
# Load the customer table; the relative path is resolved against the current working directory.
df = pd.read_csv('customer_data.csv')

In [9]:
# Clean the loaded frame in one chain and rebind `df` to the result:
#   * Churn        -> 1 where the value is "Yes", 0 otherwise
#   * TotalCharges -> parsed as float; entries that cannot be parsed become NaN
#   * rows with a missing value in ANY column are dropped (this is what removes
#     the unparseable TotalCharges rows)
#   * customerID is removed
# NOTE: run this cell once per load. On an already-cleaned frame the Churn
# comparison yields all zeros and the customerID drop raises KeyError.
df = (
    df.assign(
        Churn=df['Churn'].eq("Yes").astype(int),
        TotalCharges=pd.to_numeric(df['TotalCharges'], errors='coerce').astype(float),
    )
    .dropna()
    .drop(columns=['customerID'])
)

In [10]:
from sklearn.model_selection import train_test_split
# Separate the binary target from the feature columns
# (the train/test split itself is done in a later cell).
X = df.drop(columns='Churn')
y = df['Churn'].to_numpy()

In [11]:
# Continuous features are listed explicitly; every remaining feature column
# is treated as categorical (original column order is kept).
numerical_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_columns = X.drop(columns=numerical_columns).columns

In [12]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Numeric features: standardise to zero mean and unit variance.
numerical_preprocessor = StandardScaler()
# Categorical features: dense one-hot encoding; a category not seen during fit
# is encoded as all zeros instead of raising.
categorical_preprocessor = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

In [13]:
from sklearn.compose import ColumnTransformer

In [14]:
# Route each column group to its own transformer; the outputs are concatenated
# in the order listed (one-hot block first, scaled numeric block second).
preprocessor = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', categorical_preprocessor, categorical_columns),
        ('standard_scaler', numerical_preprocessor, numerical_columns),
    ],
)

# SVM without cross-validation

In [15]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import svm

In [16]:
# Hold out 40% of the rows for testing and keep 60% for training;
# the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)

In [17]:
# Preprocess both splits. The tuple is evaluated left to right, so the
# preprocessor is fitted on the training rows before the test rows are transformed.
# NOTE: this overwrites X_train / X_test with their transformed versions;
# re-run the split cell before re-running this one.
X_train, X_test = (
    preprocessor.fit_transform(X_train),  # learn categories / scaling from training data only
    preprocessor.transform(X_test),       # reuse the parameters fitted above
)

In [18]:
from sklearn.metrics import classification_report

# Train an SVC with default hyper-parameters on the preprocessed training set
# (`fit` returns the fitted estimator), then report per-class precision /
# recall / F1 on the held-out test set.
svm_model = svm.SVC().fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.91      0.87      2061
           1       0.68      0.51      0.58       752

    accuracy                           0.80      2813
   macro avg       0.76      0.71      0.73      2813
weighted avg       0.79      0.80      0.80      2813



# SVM with cross-validation

In [19]:
# Chain the preprocessing and the classifier into a single estimator, so the
# preprocessor is fitted together with the model on whatever data the pipeline is given.
# The preprocessor applies OneHotEncoder only to the categorical columns and
# StandardScaler only to the numerical columns.
svm_pipe = make_pipeline(preprocessor, svm.SVC())
svm_pipe


In [20]:
from sklearn.model_selection import cross_val_score, KFold

In [21]:
# Shuffled 5-fold splitter. The fixed seed keeps the fold assignment, and
# therefore the reported scores, identical across kernel restarts.
cv = KFold(n_splits=5, shuffle=True, random_state=0)

In [22]:
# Cross-validate the whole pipeline on the raw features: one score per fold of `cv`,
# computed with the estimator's default scorer (no `scoring` argument is passed).
scores = cross_val_score(svm_pipe, X, y, cv=cv)

In [33]:
# `scores` holds one value per fold; display their mean as the single summary figure.
scores.mean()

0.8017626761538781

# Using GridSearchCV to look at multiple models

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [31]:
# One pipeline per candidate classifier, all sharing the same column preprocessing.
# * `multi_class` is no longer passed to LogisticRegression: 'auto' is already
#   the default and the parameter is deprecated in recent scikit-learn releases.
# * The estimators that use randomness are seeded so results can be reproduced.
svm_pipe = make_pipeline(preprocessor, svm.SVC())
lgr_pipe = make_pipeline(preprocessor, LogisticRegression(solver='liblinear', random_state=0))
forest_pipe = make_pipeline(preprocessor, RandomForestClassifier(random_state=0))

In [35]:
# Short label -> candidate pipeline; the labels become the keys of the results.
models = dict(
    svm=svm_pipe,
    lgr=lgr_pipe,
    rdf=forest_pipe,
)

In [37]:
def evaluate_models(models=None, random_state=0):
    """Return the mean 5-fold cross-validated score of each candidate model.

    Parameters
    ----------
    models : dict, optional
        Maps a label to an estimator or pipeline. ``None`` or an empty
        mapping yields an empty result.
    random_state : int or None, default 0
        Seed for the shuffled ``KFold`` splitter. With an integer seed every
        model is scored on the same folds and the result is repeatable.
        Pass ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    dict
        ``{label: mean score}``, in the iteration order of ``models``.

    Notes
    -----
    Scores are computed on the notebook-level ``X`` and ``y``.
    """
    # A fresh dict on every call: a ``{}`` default would be shared between calls.
    if not models:
        return {}

    cv = KFold(n_splits=5, shuffle=True, random_state=random_state)
    return {
        label: np.mean(cross_val_score(estimator, X, y, cv=cv))
        for label, estimator in models.items()
    }


In [38]:
# Mean cross-validated score of every pipeline in `models`, keyed by the same labels.
res = evaluate_models(models=models)

In [42]:
# Mean cross-validated score per model; logistic regression comes out on top
# in the run recorded below.
for label, mean_score in res.items():
    print(f"{label}: {mean_score}")

svm: 0.7997713121043836
lgr: 0.8048939411861642
rdf: 0.7898146940566423
