# Problem Statement 1

### Importing all required libraries

In [106]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.metrics import confusion_matrix,recall_score,precision_score,f1_score,roc_auc_score,roc_curve,accuracy_score
from sklearn.preprocessing import LabelEncoder

### read data file with all default columns

In [91]:
train_df=pd.read_csv('dataset.csv',index_col='customerID')

## Exploratory Data Analysis

In [92]:
#informatin about all columns
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7043 entries, 7590-VHVEG to 3186-AJIEK
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   StreamingService  7043 non-null   object 
 9   Contract          7043 non-null   object 
 10  PaperlessBilling  7043 non-null   object 
 11  PaymentMethod     7043 non-null   object 
 12  MonthlyCharges    7043 non-null   float64
 13  TotalCharges      7043 non-null   object 
 14  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(12)
memory usage: 880.4+ KB


In [93]:
#Apart from tenure,MonthlyCharges,TotalCharges all are categorical columns
train_df[train_df['TotalCharges']==' ']

Unnamed: 0_level_0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,StreamingService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
4472-LVYGI,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,Two year,Yes,Bank transfer (automatic),52.55,,No
3115-CZMZD,Male,0,No,Yes,0,Yes,No,No,No internet service,Two year,No,Mailed check,20.25,,No
5709-LVOEQ,Female,0,Yes,Yes,0,Yes,No,DSL,Yes,Two year,No,Mailed check,80.85,,No
4367-NUYAO,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,Two year,No,Mailed check,25.75,,No
1371-DWPAZ,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,Two year,No,Credit card (automatic),56.05,,No
7644-OMVMY,Male,0,Yes,Yes,0,Yes,No,No,No internet service,Two year,No,Mailed check,19.85,,No
3213-VVOLG,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,Two year,No,Mailed check,25.35,,No
2520-SGTTA,Female,0,Yes,Yes,0,Yes,No,No,No internet service,Two year,No,Mailed check,20.0,,No
2923-ARZLG,Male,0,Yes,Yes,0,Yes,No,No,No internet service,One year,Yes,Mailed check,19.7,,No
4075-WKNIU,Female,0,Yes,Yes,0,Yes,Yes,DSL,Yes,Two year,No,Mailed check,73.35,,No


In [94]:
#impute total charges where empty string is pesent
train_df['TotalCharges'] = np.where(train_df['TotalCharges'] == ' ', train_df['MonthlyCharges'], train_df['TotalCharges'])

In [95]:
#convert all categorical to numerical using labelencoder

# creating instance of labelencoder
labelencoder = LabelEncoder()

# Assigning numerical values and storing in another column
train_df['gender'] = labelencoder.fit_transform(train_df['gender'])
train_df['SeniorCitizen'] = labelencoder.fit_transform(train_df['SeniorCitizen'])
train_df['Partner'] = labelencoder.fit_transform(train_df['Partner'])
train_df['Dependents'] = labelencoder.fit_transform(train_df['Dependents'])
train_df['PhoneService'] = labelencoder.fit_transform(train_df['PhoneService'])
train_df['MultipleLines'] = labelencoder.fit_transform(train_df['MultipleLines'])
train_df['InternetService'] = labelencoder.fit_transform(train_df['InternetService'])
train_df['StreamingService'] = labelencoder.fit_transform(train_df['StreamingService'])
train_df['Contract'] = labelencoder.fit_transform(train_df['Contract'])
train_df['PaperlessBilling'] = labelencoder.fit_transform(train_df['PaperlessBilling'])
train_df['PaymentMethod'] = labelencoder.fit_transform(train_df['PaymentMethod'])
train_df['Churn'] = labelencoder.fit_transform(train_df['Churn'])
train_df['TotalCharges']=train_df['TotalCharges'].astype('float64')

In [96]:
train_df = pd.get_dummies(train_df, columns = ['gender','SeniorCitizen','Partner','Dependents','PhoneService','MultipleLines',
                                                   'InternetService','StreamingService','Contract','PaperlessBilling','PaymentMethod',
                                                  ])

In [97]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7043 entries, 7590-VHVEG to 3186-AJIEK
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   tenure              7043 non-null   int64  
 1   MonthlyCharges      7043 non-null   float64
 2   TotalCharges        7043 non-null   float64
 3   Churn               7043 non-null   int32  
 4   gender_0            7043 non-null   uint8  
 5   gender_1            7043 non-null   uint8  
 6   SeniorCitizen_0     7043 non-null   uint8  
 7   SeniorCitizen_1     7043 non-null   uint8  
 8   Partner_0           7043 non-null   uint8  
 9   Partner_1           7043 non-null   uint8  
 10  Dependents_0        7043 non-null   uint8  
 11  Dependents_1        7043 non-null   uint8  
 12  PhoneService_0      7043 non-null   uint8  
 13  PhoneService_1      7043 non-null   uint8  
 14  MultipleLines_0     7043 non-null   uint8  
 15  MultipleLines_1     7043 non-null   uint8  
 

In [98]:
#Separate independent features and dependent features
X=train_df.loc[:, df.columns != 'Churn']
y=train_df['Churn'].ravel()

In [74]:
#Convert dependent featue(label or output into numerical binary feature)
y=np.where(y == 'Yes', 1, 0).astype('float64')

In [99]:
#split the traing data in training and validation set
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(5634, 31) (5634,)
(1409, 31) (1409,)


In [100]:
model_log_reg=LogisticRegression()

In [80]:
type(y)

numpy.ndarray

In [101]:
model_log_reg.fit(X,y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [103]:
y_pred=model_log_reg.predict(X_test)

In [104]:
confusion_matrix(y_test, y_pred)

array([[929, 112],
       [184, 184]], dtype=int64)

In [107]:
print(roc_auc_score(y_test, y_pred))
print(recall_score(y_pred, y_test))
print(precision_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

0.696205571565802
0.6216216216216216
0.6216216216216216
0.5542168674698795
0.7899219304471257


In [59]:
y_pred=clf.predict(X_test)

In [61]:
confusion_matrix(y_test, y_pred)

array([[961,  80],
       [218, 150]], dtype=int64)

In [72]:
print(roc_auc_score(y_test, y_pred))
print(recall_score(y_pred, y_test))
print(precision_score(y_test, y_pred))
print(f1_score(y_test, y_pred))

0.6653797560873741
0.6521739130434783
0.6521739130434783
0.5016722408026757
