![](images/EscUpmPolit_p.gif "UPM")

## Load and clean

In [1]:
# General import and load data
import pandas as pd
import numpy as np

from pandas import Series, DataFrame

# Training and test spliting
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing

# Estimators
from sklearn.svm import SVC

# Evaluation
from sklearn import metrics
from sklearn.cross_validation import cross_val_score, KFold, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# Optimization
from sklearn.grid_search import GridSearchCV

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(color_codes=True)


# if matplotlib is not set inline, you will not see plots
#alternatives auto gtk gtk2 inline osx qt qt5 wx tk
#%matplotlib auto
#%matplotlib qt
%matplotlib inline
%run plot_learning_curve



In [2]:
df = pd.read_csv('./data-churn/train.csv')

df.loc[df["gender"] == "Male", "gender"] = 0
df.loc[df["gender"] == "Female", "gender"] = 1

df.loc[df["Partner"] == "Yes", "Partner"] = 1
df.loc[df["Partner"] == "No", "Partner"] = 0

df.loc[df["Dependents"] == "Yes", "Dependents"] = 1
df.loc[df["Dependents"] == "No", "Dependents"] = 0

df.loc[df["PhoneService"] == "Yes", "PhoneService"] = 1
df.loc[df["PhoneService"] == "No", "PhoneService"] = 0

df.loc[df["MultipleLines"] == "No", "MultipleLines"] = 0
df.loc[df["MultipleLines"] == "No phone service", "MultipleLines"] = 0
df.loc[df["MultipleLines"] == "Yes", "MultipleLines"] = 1

df.loc[df["InternetService"] == "No", "InternetService"] = 0
df.loc[df["InternetService"] == "DSL", "InternetService"] = 1
df.loc[df["InternetService"] == "Fiber optic", "InternetService"] = 2

df.loc[df["OnlineSecurity"] == "No", "OnlineSecurity"] = 0
df.loc[df["OnlineSecurity"] == "No internet service", "OnlineSecurity"] = 0
df.loc[df["OnlineSecurity"] == "Yes", "OnlineSecurity"] = 1

df.loc[df["OnlineBackup"] == "No", "OnlineBackup"] = 0
df.loc[df["OnlineBackup"] == "No internet service", "OnlineBackup"] = 0
df.loc[df["OnlineBackup"] == "Yes", "OnlineBackup"] = 1

df.loc[df["DeviceProtection"] == "No", "DeviceProtection"] = 0
df.loc[df["DeviceProtection"] == "No internet service", "DeviceProtection"] = 0
df.loc[df["DeviceProtection"] == "Yes", "DeviceProtection"] = 1

df.loc[df["TechSupport"] == "No", "TechSupport"] = 0
df.loc[df["TechSupport"] == "No internet service", "TechSupport"] = 0
df.loc[df["TechSupport"] == "Yes", "TechSupport"] = 1

df.loc[df["StreamingTV"] == "No", "StreamingTV"] = 0
df.loc[df["StreamingTV"] == "No internet service", "StreamingTV"] = 0
df.loc[df["StreamingTV"] == "Yes", "StreamingTV"] = 1

df.loc[df["StreamingMovies"] == "No", "StreamingMovies"] = 0
df.loc[df["StreamingMovies"] == "No internet service", "StreamingMovies"] = 0
df.loc[df["StreamingMovies"] == "Yes", "StreamingMovies"] = 1

df.loc[df["Contract"] == "Two year", "Contract"] = 0
df.loc[df["Contract"] == "One year", "Contract"] = 1
df.loc[df["Contract"] == "Month-to-month", "Contract"] = 2

df.loc[df["PaperlessBilling"] == "Yes", "PaperlessBilling"] = 1
df.loc[df["PaperlessBilling"] == "No", "PaperlessBilling"] = 0

df.loc[df["PaymentMethod"] == "Bank transfer (automatic)", "PaymentMethod"] = 0
df.loc[df["PaymentMethod"] == "Mailed check", "PaymentMethod"] = 1
df.loc[df["PaymentMethod"] == "Electronic check", "PaymentMethod"] = 2
df.loc[df["PaymentMethod"] == "Credit card (automatic)", "PaymentMethod"] = 3

df["TotalCharges"] = df["TotalCharges"].apply(pd.to_numeric, errors='coerce')
df['gender'] = df['gender'].astype(np.int64)
df['Partner'] = df['Partner'].astype(np.int64)
df['Dependents'] = df['Dependents'].astype(np.int64)
df['PhoneService'] = df['PhoneService'].astype(np.int64)
df['MultipleLines'] = df['MultipleLines'].astype(np.int64)
df['InternetService'] = df['InternetService'].astype(np.int64)
df['OnlineSecurity'] = df['OnlineSecurity'].astype(np.int64)
df['OnlineBackup'] = df['OnlineBackup'].astype(np.int64)
df['DeviceProtection'] = df['DeviceProtection'].astype(np.int64)
df['TechSupport'] = df['TechSupport'].astype(np.int64)
df['StreamingTV'] = df['StreamingTV'].astype(np.int64)
df['StreamingMovies'] = df['StreamingMovies'].astype(np.int64)
df['Contract'] = df['Contract'].astype(np.int64)
df['PaperlessBilling'] = df['PaperlessBilling'].astype(np.int64)
df['PaymentMethod'] = df['PaymentMethod'].astype(np.int64)
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())
df['TotalCharges'] = (df['TotalCharges']-df['TotalCharges'].min())/(df['TotalCharges'].max()-df['TotalCharges'].min())
df['tenure'] = (df['tenure']-df['tenure'].min())/(df['tenure'].max()-df['tenure'].min())
df['MonthlyCharges'] = (df['MonthlyCharges']-df['MonthlyCharges'].min())/(df['MonthlyCharges'].max()-df['MonthlyCharges'].min())

df[-5:]

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
5538,241896052,0,0,0,0,0.541667,1,0,1,0,...,0,1,0,0,0,1,0,0.31393,0.225284,0
5539,1917607,1,0,1,1,0.472222,1,0,1,1,...,1,0,0,0,1,1,1,0.423383,0.233465,0
5540,1651761,1,0,1,0,0.388889,1,0,0,0,...,0,0,0,0,2,0,1,0.022388,0.062798,0
5541,8238468,0,1,1,0,0.958333,1,1,2,0,...,1,0,0,1,2,1,0,0.767164,0.73427,0
5542,5531185,0,0,0,0,0.875,1,1,1,1,...,1,1,0,0,1,0,0,0.502985,0.472251,0


In [3]:
#Check types are numeric
df.dtypes

customerID            int64
gender                int64
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure              float64
PhoneService          int64
MultipleLines         int64
InternetService       int64
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract              int64
PaperlessBilling      int64
PaymentMethod         int64
MonthlyCharges      float64
TotalCharges        float64
Churn                 int64
dtype: object

In [4]:
#Check there are not missing values
df.isnull().any()

customerID          False
gender              False
SeniorCitizen       False
Partner             False
Dependents          False
tenure              False
PhoneService        False
MultipleLines       False
InternetService     False
OnlineSecurity      False
OnlineBackup        False
DeviceProtection    False
TechSupport         False
StreamingTV         False
StreamingMovies     False
Contract            False
PaperlessBilling    False
PaymentMethod       False
MonthlyCharges      False
TotalCharges        False
Churn               False
dtype: bool

# Train and test splitting

We use the same techniques we applied in the Iris dataset. 

Nevertheless, we need to remove the column 'Survived' 

In [5]:
# Features of the model
features = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 
            'InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies',
            'Contract','PaperlessBilling','PaymentMethod','MonthlyCharges','TotalCharges']
# Transform dataframe in numpy arrays
X = df[features].values
y = df['Churn'].values
len(y)

5543

# Define model

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
types_of_kernels = ['linear', 'rbf', 'poly']

kernel = types_of_kernels[0]
gamma = 3.0

# Create kNN model
model = SVC(kernel=kernel, probability=True, gamma=gamma)
model.fit(X_train, y_train)

predicted = model.predict(X_test)
expected = y_test
metrics.accuracy_score(expected, predicted)

0.8088023088023089

Ok, we get around 80% of accuracy!

In [7]:
dfTest = pd.read_csv('./data-churn/test_nolabel.csv')

In [8]:
dfTest.loc[dfTest["gender"] == "Male", "gender"] = 0
dfTest.loc[dfTest["gender"] == "Female", "gender"] = 1

dfTest.loc[dfTest["Partner"] == "Yes", "Partner"] = 1
dfTest.loc[dfTest["Partner"] == "No", "Partner"] = 0

dfTest.loc[dfTest["Dependents"] == "Yes", "Dependents"] = 1
dfTest.loc[dfTest["Dependents"] == "No", "Dependents"] = 0

dfTest.loc[dfTest["PhoneService"] == "Yes", "PhoneService"] = 1
dfTest.loc[dfTest["PhoneService"] == "No", "PhoneService"] = 0

dfTest.loc[dfTest["MultipleLines"] == "No", "MultipleLines"] = 0
dfTest.loc[dfTest["MultipleLines"] == "No phone service", "MultipleLines"] = 0
dfTest.loc[dfTest["MultipleLines"] == "Yes", "MultipleLines"] = 1

dfTest.loc[dfTest["InternetService"] == "No", "InternetService"] = 0
dfTest.loc[dfTest["InternetService"] == "DSL", "InternetService"] = 1
dfTest.loc[dfTest["InternetService"] == "Fiber optic", "InternetService"] = 2

dfTest.loc[dfTest["OnlineSecurity"] == "No", "OnlineSecurity"] = 0
dfTest.loc[dfTest["OnlineSecurity"] == "No internet service", "OnlineSecurity"] = 0
dfTest.loc[dfTest["OnlineSecurity"] == "Yes", "OnlineSecurity"] = 1

dfTest.loc[dfTest["OnlineBackup"] == "No", "OnlineBackup"] = 0
dfTest.loc[dfTest["OnlineBackup"] == "No internet service", "OnlineBackup"] = 0
dfTest.loc[dfTest["OnlineBackup"] == "Yes", "OnlineBackup"] = 1

dfTest.loc[dfTest["DeviceProtection"] == "No", "DeviceProtection"] = 0
dfTest.loc[dfTest["DeviceProtection"] == "No internet service", "DeviceProtection"] = 0
dfTest.loc[dfTest["DeviceProtection"] == "Yes", "DeviceProtection"] = 1

dfTest.loc[dfTest["TechSupport"] == "No", "TechSupport"] = 0
dfTest.loc[dfTest["TechSupport"] == "No internet service", "TechSupport"] = 0
dfTest.loc[dfTest["TechSupport"] == "Yes", "TechSupport"] = 1

dfTest.loc[dfTest["StreamingTV"] == "No", "StreamingTV"] = 0
dfTest.loc[dfTest["StreamingTV"] == "No internet service", "StreamingTV"] = 0
dfTest.loc[dfTest["StreamingTV"] == "Yes", "StreamingTV"] = 1

dfTest.loc[dfTest["StreamingMovies"] == "No", "StreamingMovies"] = 0
dfTest.loc[dfTest["StreamingMovies"] == "No internet service", "StreamingMovies"] = 0
dfTest.loc[dfTest["StreamingMovies"] == "Yes", "StreamingMovies"] = 1

dfTest.loc[dfTest["Contract"] == "Two year", "Contract"] = 0
dfTest.loc[dfTest["Contract"] == "One year", "Contract"] = 1
dfTest.loc[dfTest["Contract"] == "Month-to-month", "Contract"] = 2

dfTest.loc[dfTest["PaperlessBilling"] == "Yes", "PaperlessBilling"] = 1
dfTest.loc[dfTest["PaperlessBilling"] == "No", "PaperlessBilling"] = 0

dfTest.loc[dfTest["PaymentMethod"] == "Bank transfer (automatic)", "PaymentMethod"] = 0
dfTest.loc[dfTest["PaymentMethod"] == "Mailed check", "PaymentMethod"] = 1
dfTest.loc[dfTest["PaymentMethod"] == "Electronic check", "PaymentMethod"] = 2
dfTest.loc[dfTest["PaymentMethod"] == "Credit card (automatic)", "PaymentMethod"] = 3

dfTest["TotalCharges"] = dfTest["TotalCharges"].apply(pd.to_numeric, errors='coerce')
dfTest['gender'] = dfTest['gender'].astype(np.int64)
dfTest['Partner'] = dfTest['Partner'].astype(np.int64)
dfTest['Dependents'] = dfTest['Dependents'].astype(np.int64)
dfTest['PhoneService'] = dfTest['PhoneService'].astype(np.int64)
dfTest['MultipleLines'] = dfTest['MultipleLines'].astype(np.int64)
dfTest['InternetService'] = dfTest['InternetService'].astype(np.int64)
dfTest['OnlineSecurity'] = dfTest['OnlineSecurity'].astype(np.int64)
dfTest['OnlineBackup'] = dfTest['OnlineBackup'].astype(np.int64)
dfTest['DeviceProtection'] = dfTest['DeviceProtection'].astype(np.int64)
dfTest['TechSupport'] = dfTest['TechSupport'].astype(np.int64)
dfTest['StreamingTV'] = dfTest['StreamingTV'].astype(np.int64)
dfTest['StreamingMovies'] = dfTest['StreamingMovies'].astype(np.int64)
dfTest['Contract'] = dfTest['Contract'].astype(np.int64)
dfTest['PaperlessBilling'] = dfTest['PaperlessBilling'].astype(np.int64)
dfTest['PaymentMethod'] = dfTest['PaymentMethod'].astype(np.int64)
dfTest['TotalCharges'] = dfTest['TotalCharges'].fillna(dfTest['TotalCharges'].median())
dfTest['TotalCharges'] = (dfTest['TotalCharges']-dfTest['TotalCharges'].min())/(dfTest['TotalCharges'].max()-dfTest['TotalCharges'].min())
dfTest['tenure'] = (dfTest['tenure']-dfTest['tenure'].min())/(dfTest['tenure'].max()-dfTest['tenure'].min())
dfTest['MonthlyCharges'] = (dfTest['MonthlyCharges']-dfTest['MonthlyCharges'].min())/(dfTest['MonthlyCharges'].max()-dfTest['MonthlyCharges'].min())

dfTest.drop(['customerID'], axis=1, inplace=True)

dfTest[-5:]

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
1495,1,0,1,1,0.027778,0,0,1,0,0,0,1,0,0,2,0,0,0.10402,0.003043
1496,1,1,1,0,0.069444,1,0,2,0,0,0,0,0,0,2,1,2,0.509045,0.03785
1497,1,0,0,0,0.638889,1,0,0,0,0,0,0,0,0,0,1,2,0.024121,0.107676
1498,0,1,0,0,0.291667,1,1,2,0,1,1,0,1,1,2,1,2,0.860804,0.264184
1499,0,0,1,1,0.944444,1,0,2,1,1,0,1,0,1,0,0,0,0.767839,0.781464


In [9]:
# set of parameters to test
tuned_parameters = [{'max_depth': np.arange(3, 10),
                     'criterion': ['gini', 'entropy'], 
                     'splitter': ['best', 'random'],
                     'min_samples_leaf': [2, 5, 10],
                     'class_weight':['balanced'],
                     'max_leaf_nodes': [None, 5, 10, 20]
                    }]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # cv = the fold of the cross-validation cv, defaulted to 5
    gs = GridSearchCV(SVC(kernel=kernel, probability=True, gamma=gamma), tuned_parameters, cv=10, scoring='%s_weighted' % score)
    gs.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(gs.best_params_)
    print()
    print("Detailed classification report:")
    print()

    y_true, y_pred = y_test, gs.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision
()


ValueError: Invalid parameter splitter for estimator SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=3.0, kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
model = SVC(kernel=kernel, probability=True, gamma=gamma)
model.fit(X, y)
churn = model.predict(dfTest)
dataframeResult = pd.read_csv('./data-churn/test_nolabel.csv').filter(['customerID'], axis=1)
dataframeResult['Churn'] = churn
dataframeResult[:]

In [None]:
dataframeResult.to_csv('outSVM.csv', sep=',',index=False)