In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#1 Load the dataset and explore the variables
# Read the churn data from the CSV file and preview the first rows.
customers = pd.read_csv('customer_churn.csv')
customers.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
#2 We will try to predict variable Churn using a logistic regression on variables tenure, SeniorCitizen, MonthlyCharges.
# Take an explicit copy of the column subset: later cells assign into this frame
# (renamed columns, encoded churn), and writing into a slice of the loaded frame
# triggers pandas' SettingWithCopyWarning.
customers = customers[['SeniorCitizen','tenure','MonthlyCharges','Churn']].copy()
customers

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,Churn
0,0,1,29.85,No
1,0,34,56.95,No
2,0,2,53.85,Yes
3,0,45,42.30,No
4,0,2,70.70,Yes
...,...,...,...,...
7038,0,24,84.80,No
7039,0,72,103.20,No
7040,0,11,29.60,No
7041,1,4,74.40,Yes


In [None]:
# Count the missing values in each column.
customers.isnull().sum()

In [None]:
# Normalise the column names to lower case.
cols = [c.lower() for c in customers.columns]
customers.columns = cols

In [None]:
#3 Split the Dataset into X ('tenure', 'SeniorCitizen', 'MonthlyCharges') and y ('Churn')
def transform_churn(x):
    """Encode a churn label as a binary integer.

    Args:
        x: A raw label such as 'Yes' / 'No', or a value that has already
            been encoded by this function.

    Returns:
        1 if ``x`` is 'Yes' or an already-encoded 1, otherwise 0.
    """
    # Accepting an already-encoded 1 keeps the encoding idempotent: re-running
    # the cell that applies this function no longer turns every label into 0.
    return 1 if x in ('Yes', 1) else 0

# Replace the textual churn labels with their integer encoding, row by row.
customers['churn'] = [transform_churn(label) for label in customers['churn']]
customers.head()

In [None]:
# Separate the target column from the feature columns.
y = customers['churn']
X = customers.drop(columns='churn')

In [None]:
# Pairwise correlations between the features, drawn as an annotated heatmap.
correlations_matrix = X.corr()
sns.heatmap(data=correlations_matrix, annot=True)
plt.show()

In [None]:
# Class balance of the target column.
# Series.value_counts is used because the top-level pd.value_counts is deprecated.
balance_target = customers['churn'].value_counts()
balance_target.plot(kind='bar')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
#4 Build the logistic regression model.
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then reuse it for the test split.
transformer = StandardScaler().fit(X_train)

# transform() returns a plain array, so rebuild the frames with the original
# column names AND the original row labels; without index= the frames get a
# fresh 0..n-1 index and no longer align by label with y_train / y_test.
X_std_train = pd.DataFrame(
    transformer.transform(X_train),
    columns=X.columns,
    index=X_train.index,
)
X_std_test = pd.DataFrame(
    transformer.transform(X_test),
    columns=X.columns,
    index=X_test.index,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit the baseline classifier on the scaled training data, then score it on the test split.
classification = LogisticRegression(solver='lbfgs', random_state=0)
classification.fit(X_std_train, y_train)
classification.score(X_std_test, y_test)

In [None]:
#5 Evaluate the model.
from sklearn.metrics import f1_score, precision_score, recall_score

predictions = classification.predict(X_std_test)

# Print each metric for the baseline predictions, in the order precision, recall, f1.
for label, metric in (
    ("precision: ", precision_score),
    ("recall: ", recall_score),
    ("f1: ", f1_score),
):
    print(label, metric(y_test, predictions))

In [None]:
#6 Even a simple model will give us more than 70% accuracy. Why?
from sklearn.metrics import confusion_matrix

# Tabulate true labels against the baseline model's predictions.
confusion_matrix(y_true=y_test, y_pred=predictions)

In [None]:
#7 Synthetic Minority Oversampling Technique (SMOTE) is an oversampling technique based on nearest neighbors that adds new points
#between existing points. Apply imblearn.over_sampling.SMOTE to the dataset.
#Build and evaluate the logistic regression model. Is there any improvement?
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is left untouched.
sm = SMOTE(random_state=100, k_neighbors=3)
# The original call was missing its closing parenthesis, which made the cell a SyntaxError.
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_std_train, y_train)

In [None]:
# Fit the same classifier on the SMOTE-resampled training data, then score it on the test split.
class_smote = LogisticRegression(solver='lbfgs', random_state=0)
class_smote.fit(X_train_SMOTE, y_train_SMOTE)
class_smote.score(X_std_test, y_test)

In [None]:
pred_smote = class_smote.predict(X_std_test)

# Print each metric for the SMOTE-trained model, in the order precision, recall, f1.
for label, metric in (
    ("precision: ", precision_score),
    ("recall: ", recall_score),
    ("f1: ", f1_score),
):
    print(label, metric(y_test, pred_smote))

In [None]:
# Show the labels predicted for the test set by the SMOTE-trained model.
pred_smote

In [None]:
# Tabulate true labels against the SMOTE-trained model's predictions.
confusion_matrix(y_true=y_test, y_pred=pred_smote)

In [None]:
# NOTE(review): hard-coded counts, presumably TP / (TP + FN) (i.e. recall) read off
# the confusion matrix in the previous cell — TODO confirm. The literals go stale
# if the split, the resampling, or the model changes.
431/(431+143)

In [None]:
# Conclusion: recall went up after applying SMOTE.