In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

%matplotlib inline

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
# Load the Telco customer churn CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("data/WA_Fn-UseC_-Telco-Customer-Churn.csv")

## 1. Data Cleaning

In [7]:
# Entries of TotalCharges that are exactly a single space are replaced by 0,
# after which the whole column can be cast to float.
blank_total_charges = df["TotalCharges"] == " "
df["TotalCharges"] = df["TotalCharges"].mask(blank_total_charges, 0).astype(float)

In [12]:
# Columns kept as genuine categoricals (one-hot encoded later on).
categorical = ["Contract", "InternetService"]

# Every other column with exactly three distinct values is collected in
# `to_bool` for the 1/0 encoding done in the next cell.
level_counts = df.nunique()
to_bool = [name
           for name in level_counts[level_counts == 3].index
           if name not in categorical]

In [13]:
# Encode "Yes" as 1 and every other value as 0.
# Only columns that are not numeric yet are touched: once a column has been
# encoded, comparing its integers to "Yes" is False everywhere, so re-running
# this cell without the guard would silently overwrite the column with zeros.
not_yet_encoded = [col for col in to_bool if not pd.api.types.is_numeric_dtype(df[col])]
if not_yet_encoded:
    df[not_yet_encoded] = np.where(df[not_yet_encoded].eq("Yes"), 1, 0)

In [17]:
# Replace the `gender` column by a 1/0 `female` flag (1 where gender is "Female").
df = df.drop(columns="gender").assign(female=(df["gender"] == "Female").astype(int))

In [20]:
to_bool_as_well = ["Partner", "Dependents", "PhoneService", "Churn", "PaperlessBilling"]

# Encode "Yes" as 1 and every other value as 0.
# Only columns that are not numeric yet are touched: without this guard,
# re-running the cell compares the already-encoded integers to "Yes" and
# silently resets every listed column (including the Churn target) to 0.
still_labelled = [col for col in to_bool_as_well if not pd.api.types.is_numeric_dtype(df[col])]
if still_labelled:
    df[still_labelled] = np.where(df[still_labelled].eq("Yes"), 1, 0)

In [21]:
# Inspect the column dtypes after the cleaning steps above.
df.dtypes

customerID           object
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
PhoneService          int64
MultipleLines         int64
InternetService      object
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract             object
PaperlessBilling      int64
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                 int64
female                int64
dtype: object

In [22]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,customerID,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,female
0,7590-VHVEG,0,1,0,1,0,0,DSL,0,1,...,0,0,0,Month-to-month,1,Electronic check,29.85,29.85,0,1
1,5575-GNVDE,0,0,0,34,1,0,DSL,1,0,...,0,0,0,One year,0,Mailed check,56.95,1889.5,0,0
2,3668-QPYBK,0,0,0,2,1,0,DSL,1,1,...,0,0,0,Month-to-month,1,Mailed check,53.85,108.15,1,0
3,7795-CFOCW,0,0,0,45,0,0,DSL,1,0,...,1,0,0,One year,0,Bank transfer (automatic),42.3,1840.75,0,0
4,9237-HQITU,0,0,0,2,1,0,Fiber optic,0,0,...,0,0,0,Month-to-month,1,Electronic check,70.7,151.65,1,1


## 2. Create a function that we can use to quickly test different
- feature sets
- rescaling methods for numerical features
- ML algorithms

In [26]:
# Target column, kept as a one-element list.
target_col = ["Churn"]
# Numerical feature columns.
num_cols = ["tenure", "MonthlyCharges", "TotalCharges"]

In [27]:
def evaluate_model(df, target_col, num_cols, cat_cols, bool_cols, algo, test_size=.1, scaler=None, seed=3):
    """
    Fit `algo` on a random train split of `df` and return its accuracy on the
    held-out test split.

    Pipeline: split -> optionally rescale the numerical columns (scaler fitted
    on the train split only) -> one-hot encode the categorical columns ->
    fit -> predict -> accuracy.

    args:
    df: pd.DataFrame containing every column named in the arguments below.
    target_col: label of the target column, or a one-element list with it.
    num_cols: list of numerical feature columns. When `scaler` is given they
        are replaced by rescaled copies named "scaled_<column>".
    cat_cols: list of categorical feature columns. They are one-hot encoded
        and the first level of each column (as seen in the train split) is
        dropped. The test split is aligned to the train encoding: a level
        absent from the test split becomes an all-zero column, a level unseen
        in the train split is ignored.
    bool_cols: list of feature columns passed to the model unchanged.
    algo: estimator exposing fit(X, y) and predict(X). It is fitted in place.
    test_size: forwarded to train_test_split.
    scaler: optional transformer exposing fit(X) and transform(X). It is
        fitted in place on the train split. None disables rescaling.
    seed: forwarded to train_test_split as random_state.

    returns:
    The accuracy_score of the predictions on the test split.
    """
    # Copy the column lists so the caller's lists are never modified.
    num_features = list(num_cols)
    cat_cols = list(cat_cols)
    bool_cols = list(bool_cols)

    # split dataset; drop=True keeps the old index from leaking in as a column
    train, test = train_test_split(df, test_size=test_size, random_state=seed)
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)

    # apply rescaling if requested (nothing to rescale without numerical columns)
    if scaler is not None and num_features:
        scaled_names = ["scaled_" + col for col in num_features]

        # fit on train only, so no information from the test split is used
        fitted_scaler = scaler.fit(train[num_features])

        train = train.join(pd.DataFrame(fitted_scaler.transform(train[num_features]),
                                        columns=scaled_names,
                                        index=train.index))
        test = test.join(pd.DataFrame(fitted_scaler.transform(test[num_features]),
                                      columns=scaled_names,
                                      index=test.index))

        # the model sees the rescaled columns instead of the raw ones
        num_features = scaled_names

    # create dummies
    dummy_cols = []
    if cat_cols:
        columns_before = set(train.columns)
        train = pd.get_dummies(data=train,
                               columns=cat_cols,
                               drop_first=True)
        # The dummy columns are exactly the ones get_dummies added; matching on
        # a name prefix could also pick up unrelated columns.
        dummy_cols = [col for col in train.columns if col not in columns_before]

        # Encode every level of the test split, then align to the train
        # encoding. Dropping the first level independently per split would drop
        # different levels whenever the splits do not contain the same levels.
        test = pd.get_dummies(data=test,
                              columns=cat_cols)
        for col in dummy_cols:
            if col not in test.columns:
                test[col] = 0

    # create input sets
    feature_cols = bool_cols + dummy_cols + num_features
    train_X = train[feature_cols]
    train_y = train[target_col]
    test_X = test[feature_cols]
    test_y = test[target_col]

    # train model
    algo.fit(train_X, train_y.values.ravel())

    # predict
    predictions = algo.predict(test_X)

    return accuracy_score(test_y, predictions)

In [36]:
# Target column (one-element list) and numerical feature columns.
target_col = ["Churn"]
num_cols = ["tenure", "MonthlyCharges", "TotalCharges"]

# Every column with exactly two distinct values, except the target, is used
# as a boolean feature.
distinct_counts = df.nunique()
bool_cols = [name
             for name in distinct_counts[distinct_counts == 2].index
             if name not in target_col]

In [59]:
# Logistic regression on the boolean, categorical and numerical features,
# with the numerical ones rescaled by StandardScaler.
evaluate_model(df=df,
               target_col=target_col,
               num_cols=num_cols,
               cat_cols=["InternetService", "Contract", "PaymentMethod"],
               bool_cols=bool_cols,
               algo=LogisticRegression(solver="liblinear"),
               scaler=StandardScaler())

0.8141843971631205

**Apply dict unpacking**

In [38]:
# Column arguments of evaluate_model bundled in a dict (keys are its parameter
# names) so they can be passed with ** unpacking.
cols = {"target_col": target_col,
        "num_cols": num_cols,
        "cat_cols": ["InternetService", "Contract", "PaymentMethod"],
        "bool_cols": bool_cols}

In [39]:
# Same arguments as the explicit-keyword call, with the column arguments
# supplied by unpacking `cols`.
evaluate_model(df=df,
               **cols,
               algo=LogisticRegression(solver="liblinear"),
               scaler=StandardScaler())

0.8141843971631205

### Compare to our previous model (from week 6)

In [41]:
services = ["PhoneService", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies"]

# Build the engineered features on a separate frame so `df` stays untouched:
# - no_of_services: row-wise sum of the 1/0 service columns
# - AutomaticPayment: 1 where PaymentMethod contains "automatic", else 0
df_p = df.assign(
    no_of_services=lambda frame: frame[services].sum(axis=1),
    AutomaticPayment=lambda frame: np.where(frame["PaymentMethod"].str.contains("automatic"), 1, 0),
)

# Commented-out `variables` list kept for reference (presumably the feature
# list of the week 6 model; confirm against that notebook):
# variables = ["no_of_services",
#              "AutomaticPayment",
#              "tenure",
#              "MonthlyCharges",
#              "SeniorCitizen",
#              "OnlineBackup",
#              "Contract_One year",
#              "Contract_Two year"]

cols_p = dict(target_col=target_col,
              num_cols=["no_of_services", "tenure", "MonthlyCharges"],
              cat_cols=["Contract"],
              bool_cols=["AutomaticPayment", "SeniorCitizen", "OnlineBackup"])

evaluate_model(df=df_p,
               algo=LogisticRegression(solver="liblinear"),
               scaler=StandardScaler(),
               **cols_p)

0.8042553191489362

### Use MinMaxScaler()

In [42]:
from sklearn.preprocessing import MinMaxScaler

# Same features and model as the `cols` run, but the numerical columns are
# rescaled with MinMaxScaler instead of StandardScaler.
evaluate_model(df=df,
               **cols,
               algo=LogisticRegression(solver="liblinear"),
               scaler=MinMaxScaler())

0.8156028368794326

### Try DecisionTree

In [65]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree on the same features, with MinMaxScaler rescaling;
# random_state is fixed on the tree so the run is repeatable.
evaluate_model(df=df,
               **cols,
               algo=DecisionTreeClassifier(random_state=666),
               scaler=MinMaxScaler())

0.7460992907801418

In [64]:
# Same decision tree without any rescaling (scaler defaults to None).
evaluate_model(df=df,
               **cols,
               algo=DecisionTreeClassifier(random_state=666))

0.7446808510638298

In [66]:
from sklearn.ensemble import RandomForestClassifier

  return f(*args, **kwds)


In [71]:
# Random forest on the same features, without rescaling (scaler defaults to None).
evaluate_model(df=df,
               **cols,
               algo=RandomForestClassifier(random_state=213))

0.8056737588652483