In [86]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV 
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [24]:
data = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
data.head().T
# data.info()
data['TotalCharges'] = pd.to_numeric(data.TotalCharges, errors='coerce')
# data[total_charges.isnull()][['customerID', 'TotalCharges']]
data.TotalCharges = data.TotalCharges.fillna(0)
data.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [25]:
data.columns = data.columns.str.lower().str.replace(' ','_')
string_columns = list(data.dtypes[data.dtypes == 'object'].index)
for col in string_columns:
    data[col]= data[col].str.lower().str.replace(' ','_')
data.churn = (data.churn == 'yes').astype(int)
print(data.churn.value_counts())
data_train_full, data_test, = train_test_split(data, test_size = 0.2, random_state = 1)
data_train, data_val = train_test_split(data_train_full, test_size = 0.2, random_state = 11)

y_train = data_train.churn.values
y_val = data_val.churn.values
y_test = data_test.churn.values

0    5174
1    1869
Name: churn, dtype: int64


In [77]:
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents', 'phoneservice', 'multiplelines', 
              'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 
               'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod']
numerical = ['tenure', 'monthlycharges', 'totalcharges']

train_dict = data_train[categorical+numerical].to_dict(orient='rows')
val_dict = data_val[categorical+numerical].to_dict(orient='rows')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

  train_dict = data_train[categorical+numerical].to_dict(orient='rows')
  val_dict = data_val[categorical+numerical].to_dict(orient='rows')


### LASSO REGRESSION

In [69]:
scaler = StandardScaler()
X_std = scaler.fit_transform(X_train)
param = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
lr_model = LogisticRegression(penalty='l1', solver = 'liblinear')
gs_model = GridSearchCV(estimator=lr_model, param_grid=param)
gs_model.fit(X_std, y_train)

model = LogisticRegression(**gs_model.best_params_, penalty='l1', solver='liblinear')
model.fit(X_std, y_train)
coef = model.coef_[0]
imp = dict(zip(dv.get_feature_names(), model.coef_[0].round(3)))
model = LogisticRegression(**gs_model.best_params_, penalty='l1', solver='liblinear')
model.fit(X_std, y_train)
coef = model.coef_[0]
imp = dict(zip(dv.get_feature_names(), model.coef_[0].round(3)))

y_pred = model.predict_proba(X_val)[:,1]
auc = roc_auc_score(y_val, y_pred)
print(auc)

0.4883574702851811


In [84]:
fs = SelectKBest(score_func=f_classif, k = 'all')
X_selected = fs.fit_transform(X_train, y_train)
for c in [0.001, 0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(random_state=1, solver = 'liblinear', C=c)
    model.fit(X_train, y_train)
    
    y_pred = model.predict_proba(X_val)[:,1]
    auc = roc_auc_score(y_val, y_pred)
    print('%0.4f = %0.3f'%(c,auc))

0.0010 = 0.812
0.0100 = 0.822
0.1000 = 0.823
1.0000 = 0.823
10.0000 = 0.823
100.0000 = 0.822


In [90]:
forestmodel = RandomForestClassifier(random_state=0, max_depth=5,  min_samples_leaf=100)
forestmodel.fit(X_train, y_train)
y_pred = forestmodel.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, y_pred)
print(auc)

0.8163705326355928


In [None]:
X