# Lab | Cross Validation

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier



In [2]:
churnData = pd.read_csv("files_for_lab/Customer-Churn.csv")

In [3]:
churnData["TotalCharges"][churnData["TotalCharges"] == " "] = np.nan
churnData["TotalCharges"] = pd.to_numeric(churnData["TotalCharges"])
churnData["TotalCharges"] = churnData["TotalCharges"].fillna(churnData["TotalCharges"].mean())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  churnData["TotalCharges"][churnData["TotalCharges"] == " "] = np.nan


In [4]:
churnData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   float64
 15  Churn             7043 non-null   object 
dtypes: float64(2), int64(2), object(12)
memory

In [5]:
X = churnData[["tenure", "SeniorCitizen", "MonthlyCharges", "TotalCharges"]]
y = churnData.iloc[:,-1]
y = y.apply(lambda x: 1 if x=='Yes' else 0)

sc = StandardScaler()
lr = LogisticRegression()

X = sc.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.30)

## Apply SMOTE for upsampling the data

* Use logistic regression to fit the model and compute the accuracy of the model.
* Use decision tree classifier to fit the model and compute the accuracy of the model.
* Compare the accuracies of the two models.

In [6]:
sm = SMOTE(k_neighbors = 3, random_state = 42)

X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train, y_train)
lr.fit(X_train_SMOTE, y_train_SMOTE)

In [7]:
print("train:", lr.score(X_train_SMOTE, y_train_SMOTE))
print("test:", lr.score(X_test, y_test))

train: 0.7277854195323247
test: 0.7397065783246569


In [8]:
clf = DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_train_SMOTE, y_train_SMOTE)

In [9]:
print("train:", clf.score(X_train_SMOTE, y_train_SMOTE))
print("test:", clf.score(X_test, y_test))

train: 0.99353507565337
test: 0.706578324656886


# Apply TomekLinks for downsampling

* It is important to remember that it does not make the two classes equal but only removes the points from the majority class * * that are close to other points in minority class.
* Use logistic regression to fit the model and compute the accuracy of the model.
* Use decision tree classifier to fit the model and compute the accuracy of the model.
* Compare the accuracies of the two models.
* You can also apply this algorithm one more time and check how the imbalance in the two classes changed from the last time.

In [10]:
tl = TomekLinks(sampling_strategy='auto')
X_train_tl, y_train_tl = tl.fit_resample(X_train, y_train)
lr.fit(X_train_tl, y_train_tl)

In [11]:
print("train:", lr.score(X_train_tl, y_train_tl))
print("test:", lr.score(X_test, y_test))

train: 0.7903048914235578
test: 0.7841930903928065


In [12]:
clf = clf.fit(X_train_tl, y_train_tl)

In [13]:
print("train:", clf.score(X_train_tl, y_train_tl))
print("test:", clf.score(X_test, y_test))

train: 0.9901294143452511
test: 0.718409843823947


In [14]:
X_train_tl, y_train_tl = tl.fit_resample(X_train_tl, y_train_tl)
lr.fit(X_train_tl, y_train_tl)

In [15]:
print("train:", lr.score(X_train_tl, y_train_tl))
print("test:", lr.score(X_test, y_test))

train: 0.789224526600541
test: 0.7827733080927591


DecisionTreeClassifier suffers heavily from overfitting and results in worse scores on the test data. Applying TomekLinks a second time lowers scores very slightly.