In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('/kaggle/input/customer-churn-prediction-dataset/customer_churn_dataset.csv')
df.head()

Unnamed: 0,customer_id,tenure,monthly_charges,total_charges,contract,payment_method,internet_service,tech_support,online_security,support_calls,churn
0,1,52,54.2,2818.4,Month-to-month,Credit,DSL,No,Yes,1,No
1,2,15,35.28,529.2,Month-to-month,Debit,DSL,No,No,2,No
2,3,72,78.24,5633.28,Month-to-month,Debit,DSL,No,No,0,No
3,4,61,80.24,4894.64,One year,Cash,Fiber,Yes,Yes,0,No
4,5,21,39.38,826.98,Month-to-month,UPI,Fiber,No,No,4,Yes


In [3]:
df.columns

Index(['customer_id', 'tenure', 'monthly_charges', 'total_charges', 'contract',
       'payment_method', 'internet_service', 'tech_support', 'online_security',
       'support_calls', 'churn'],
      dtype='object')

In [4]:
#axis = 1 => we decided to drop the column
df1 = df.drop('customer_id' , axis = 1)
df1.head()

Unnamed: 0,tenure,monthly_charges,total_charges,contract,payment_method,internet_service,tech_support,online_security,support_calls,churn
0,52,54.2,2818.4,Month-to-month,Credit,DSL,No,Yes,1,No
1,15,35.28,529.2,Month-to-month,Debit,DSL,No,No,2,No
2,72,78.24,5633.28,Month-to-month,Debit,DSL,No,No,0,No
3,61,80.24,4894.64,One year,Cash,Fiber,Yes,Yes,0,No
4,21,39.38,826.98,Month-to-month,UPI,Fiber,No,No,4,Yes


In [5]:
len(df1)

20000

In [6]:
df1.isna().sum()

tenure                 0
monthly_charges        0
total_charges          0
contract               0
payment_method         0
internet_service    2013
tech_support           0
online_security        0
support_calls          0
churn                  0
dtype: int64

In [7]:
df1['internet_service'].unique()

array(['DSL', 'Fiber', nan], dtype=object)

In [8]:
df1['internet_service'].value_counts()

internet_service
Fiber    10064
DSL       7923
Name: count, dtype: int64

In [9]:
#fill the missing values with 'Unknown'
df1['internet_service'] = df1['internet_service'].fillna('Unknown')

In [10]:
df1.isna().sum()

tenure              0
monthly_charges     0
total_charges       0
contract            0
payment_method      0
internet_service    0
tech_support        0
online_security     0
support_calls       0
churn               0
dtype: int64

In [11]:
df1.dtypes

tenure                int64
monthly_charges     float64
total_charges       float64
contract             object
payment_method       object
internet_service     object
tech_support         object
online_security      object
support_calls         int64
churn                object
dtype: object

In [12]:
print(df1['contract'].unique())
print(df1['payment_method'].unique())
print(df1['internet_service'].unique())
print(df1['tech_support'].unique())
print(df1['online_security'].unique())

['Month-to-month' 'One year' 'Two year']
['Credit' 'Debit' 'Cash' 'UPI']
['DSL' 'Fiber' 'Unknown']
['No' 'Yes']
['Yes' 'No']


**Data Encoding**

In [13]:
df2 = df1.copy()

In [14]:
#Integer Encoding
df2['contract'] = df2['contract'].map({'Month-to-month' : 0,
                                       'One year' : 1,
                                       'Two year' : 2})
df2['tech_support'] = df2['tech_support'].map({'No':0, 'Yes':1})
df2['online_security'] = df2['online_security'].map({'No':0, 'Yes':1})

In [15]:
#One - hot encoding
pm = pd.get_dummies(df2['payment_method'] , dtype = int)
isr = pd.get_dummies(df2['internet_service'] , dtype = int)

In [16]:
df2.head()

Unnamed: 0,tenure,monthly_charges,total_charges,contract,payment_method,internet_service,tech_support,online_security,support_calls,churn
0,52,54.2,2818.4,0,Credit,DSL,0,1,1,No
1,15,35.28,529.2,0,Debit,DSL,0,0,2,No
2,72,78.24,5633.28,0,Debit,DSL,0,0,0,No
3,61,80.24,4894.64,1,Cash,Fiber,1,1,0,No
4,21,39.38,826.98,0,UPI,Fiber,0,0,4,Yes


In [17]:
df3 = pd.concat([df2, pm , isr] , axis = 1).drop(['payment_method','internet_service'],
                                                 axis = 1)

In [18]:
df3.head()

Unnamed: 0,tenure,monthly_charges,total_charges,contract,tech_support,online_security,support_calls,churn,Cash,Credit,Debit,UPI,DSL,Fiber,Unknown
0,52,54.2,2818.4,0,0,1,1,No,0,1,0,0,1,0,0
1,15,35.28,529.2,0,0,0,2,No,0,0,1,0,1,0,0
2,72,78.24,5633.28,0,0,0,0,No,0,0,1,0,1,0,0
3,61,80.24,4894.64,1,1,1,0,No,1,0,0,0,0,1,0
4,21,39.38,826.98,0,0,0,4,Yes,0,0,0,1,0,1,0


**Define features and labels - X & y**

In [19]:
X = df3.drop('churn' , axis = 1)
y = df3['churn']

**Split the data into train & test**

In [20]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25)

**Import Algorithm**

In [21]:
from sklearn.linear_model import LogisticRegression
model_log = LogisticRegression()
model_log.fit(xtrain, ytrain)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
pd.DataFrame(model_log.coef_,columns = xtrain.columns)

Unnamed: 0,tenure,monthly_charges,total_charges,contract,tech_support,online_security,support_calls,Cash,Credit,Debit,UPI,DSL,Fiber,Unknown
0,-0.030943,0.010379,0.000252,-0.79322,-1.285131,-0.036977,0.37135,-0.122051,-0.081497,-0.114176,-0.102866,-0.200665,-0.191988,-0.027938


In [23]:
model_log.predict(xtrain)

array(['Yes', 'No', 'Yes', ..., 'No', 'No', 'No'], dtype=object)

In [24]:
model_log.predict(xtest)

array(['Yes', 'No', 'No', ..., 'No', 'No', 'Yes'], dtype=object)

In [25]:
from sklearn.metrics import accuracy_score
ytainpred = model_log.predict(xtrain)
ypred = model_log.predict(xtest)
acc2 = accuracy_score(ytrain,ytainpred)
acc = accuracy_score(ytest, ypred)
print("Accuracy:", acc)
print("Accuracy :",acc2)


Accuracy: 0.7728
Accuracy : 0.7716666666666666


In [26]:
(ytrain == ytainpred).sum()/len(xtrain)

np.float64(0.7716666666666666)