### **IMPORT LIBRARY**

In [46]:
import numpy as np
import pandas as pd


### **IMPORT DATASET**

In [47]:
df = pd.read_csv("churn.csv")

In [48]:
df.head(1)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No


### **DATA PREPROCESSING**

In [49]:
df.shape

(7043, 21)

In [50]:
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

### **IMPORTANT COLUMN**

In [51]:
columns_to_keep = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'Contract', 'TotalCharges', 'Churn']

In [52]:
df = df[columns_to_keep]

df.head(1)

### **ENCODING BINARY VARIABLE**

In [53]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
categorical_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'Contract', 'Churn']

for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col])

In [54]:
df.head(1)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,Contract,TotalCharges,Churn
0,0,0,1,0,1,0,1,0,29.85,0


### **SPLIT THE DATA INTO TRAINING AD TESTING DATA**

In [55]:
X = df.drop('Churn', axis=1)
y= df['Churn']

In [56]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test =train_test_split(X, y, test_size=0.2, random_state = 42)

In [57]:
X_train.shape

(5634, 9)

In [58]:
X_test.shape

(1409, 9)

In [59]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5634 entries, 2142 to 860
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   gender         5634 non-null   int32 
 1   SeniorCitizen  5634 non-null   int64 
 2   Partner        5634 non-null   int32 
 3   Dependents     5634 non-null   int32 
 4   tenure         5634 non-null   int64 
 5   PhoneService   5634 non-null   int32 
 6   MultipleLines  5634 non-null   int32 
 7   Contract       5634 non-null   int32 
 8   TotalCharges   5634 non-null   object
dtypes: int32(6), int64(2), object(1)
memory usage: 308.1+ KB


In [60]:
X_train['TotalCharges'] = pd.to_numeric(X_train['TotalCharges'], errors='coerce')
X_test['TotalCharges'] = pd.to_numeric(X_test['TotalCharges'], errors = 'coerce')

In [61]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5634 entries, 2142 to 860
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gender         5634 non-null   int32  
 1   SeniorCitizen  5634 non-null   int64  
 2   Partner        5634 non-null   int32  
 3   Dependents     5634 non-null   int32  
 4   tenure         5634 non-null   int64  
 5   PhoneService   5634 non-null   int32  
 6   MultipleLines  5634 non-null   int32  
 7   Contract       5634 non-null   int32  
 8   TotalCharges   5624 non-null   float64
dtypes: float64(1), int32(6), int64(2)
memory usage: 308.1 KB


In [62]:
X_train.isnull().sum()

gender            0
SeniorCitizen     0
Partner           0
Dependents        0
tenure            0
PhoneService      0
MultipleLines     0
Contract          0
TotalCharges     10
dtype: int64

In [63]:
X_train['TotalCharges'].fillna(X_train['TotalCharges'].mean(), inplace=True)
X_test['TotalCharges'].fillna(X_test['TotalCharges'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train['TotalCharges'].fillna(X_train['TotalCharges'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test['TotalCharges'].fillna(X_test['TotalCharges'].mean(), inplace=True)


In [64]:
X_train.isnull().sum()

gender           0
SeniorCitizen    0
Partner          0
Dependents       0
tenure           0
PhoneService     0
MultipleLines    0
Contract         0
TotalCharges     0
dtype: int64

### **Standardize Feature**

In [65]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [66]:
X_train

array([[-1.02516569, -0.4377492 , -0.96957859, ..., -1.00053704,
         0.37290835, -0.42210502],
       [-1.02516569, -0.4377492 , -0.96957859, ...,  1.10833901,
         1.5775905 ,  1.25536015],
       [ 0.97545208, -0.4377492 ,  1.03137591, ...,  0.05390099,
        -0.83177379, -1.00299144],
       ...,
       [ 0.97545208, -0.4377492 ,  1.03137591, ..., -1.00053704,
        -0.83177379, -0.87799925],
       [ 0.97545208,  2.28441306, -0.96957859, ...,  1.10833901,
        -0.83177379, -0.48254445],
       [ 0.97545208, -0.4377492 , -0.96957859, ..., -1.00053704,
         0.37290835, -0.81110232]])

### **LOGISTIC REGRESSION**

In [67]:
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression()
lg.fit(X_train, y_train)
y_pred = lg.predict(X_test)

In [68]:
y_pred

array([1, 0, 0, ..., 0, 0, 1])

### **Accuracy Score**

In [69]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.7757274662881476

In [70]:
import pickle

# Save the model safely using a context manager
with open('Logistic_regression.pkl', 'wb') as f:
    pickle.dump(lg, f)


### **CLASSIFICATION SYSTEM**

In [71]:
def predictive(gender, Seniorcitizen, Partner, Dependents, tenure, Phoneservice, multiline, contact, totalcharge):
    data={
        'gender':[gender],
        'Seniorcitizen':[Seniorcitizen],
        'Partner':[Partner],
        'Dependents':[Dependents],
        'tenure':[tenure],
        'Phoneservice':[Phoneservice],
        'multiline':[multiline],
        'contact':[contact],
        'totalcharge':[totalcharge]
    }
    df1= pd.DataFrame(data)
    categorical_columns = ['gender', 'Seniorcitizen', 'Partner', 'Dependents', 'Phoneservice', 'multiline', 'contact','totalcharge']
    for column in categorical_columns:
        df1[column] = label_encoder.fit_transform(df1[column])

    df1 = scaler.fit_transform(df1)
    result = lg.predict(df1).reshape(1,-1)
    return result[0]

In [72]:
gender = "Female"
Seniorcitizen ="No"
Partner = "Yes"
Dependents = "No"
tenure = 1
Phoneservice = "No"
multiline = "No phone Service"
contact = "Month-to-month"
totalcharge = 29.85
result=predictive(gender,Seniorcitizen,Partner,Dependents, tenure,Phoneservice,multiline, contact, totalcharge)
if result ==0:
    print("Not churn")
else:
    print("Churn")

Not churn
