### Import required libraries

In [14]:
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

### Read the data

In [2]:
# Load the customer churn table from a CSV in the working directory
# and preview the first five rows.
churn = pd.read_csv("churn_dataset.csv")
churn.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# (rows, columns) of the loaded frame.
churn.shape

(7043, 21)

In [4]:
# Per-column non-null counts and dtypes.
churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [5]:
# Count of rows that are exact duplicates of an earlier row.
churn.duplicated().sum()

0

In [6]:
# Missing-value (NaN/None) count per column.
# NOTE: whitespace-only strings are not NaN, so blank text fields do not show up here.
churn.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [7]:
# Drop rows whose TotalCharges is blank, then convert the column to float.
#
# * Blank detection strips whitespace first, so "", " " and "   " are all
#   treated as blank (the exact-match test against " " missed the others and
#   the float conversion would then raise).
# * .copy() makes the filtered frame independent of the original, so the
#   column assignment below does not trigger SettingWithCopyWarning.
# * astype(str) keeps the cell safe to re-run after the column is already float.
# Any other non-numeric text still raises in astype(float), as before.
total_charges_text = churn["TotalCharges"].astype(str).str.strip()
churn = churn.loc[total_charges_text != ""].copy()
churn["TotalCharges"] = churn["TotalCharges"].astype(float)

### Label Encoding

In [10]:
# Integer-encode every object-dtype column of `churn`, one LabelEncoder per column.
#
# The fitted encoders are kept in `encoders`, keyed by column name, so the
# category -> integer mapping stays recoverable (e.g. encoders["gender"].classes_,
# or encoders["gender"].transform([...]) when preparing rows for prediction).
# Previously each encoder was discarded as soon as the column was transformed.
#
# NOTE: re-running this cell after the columns are already encoded finds no
# object columns and leaves `encoders` empty; re-run from the data-loading
# cell if the mapping is needed again.
encoders = {}
for column in churn.select_dtypes(include="object").columns:
    encoder = LabelEncoder()
    churn[column] = encoder.fit_transform(churn[column])
    encoders[column] = encoder

### Selecting important features

In [83]:
# Names of the columns used as model inputs, in column order.
imp_features = [
    'gender',
    'SeniorCitizen',
    'PhoneService',
    'MultipleLines',
    'StreamingTV',
    'StreamingMovies',
    'PaperlessBilling',
    'MonthlyCharges',
    'TotalCharges',
]

### Separate the data

In [62]:
# Feature matrix and target.
#
# X is selected directly from `imp_features` instead of dropping every other
# column by name. The two spellings yield the same nine columns in the same
# order for this dataset, but the drop list had to be kept in sync by hand
# and left `imp_features` unused.
X = churn[imp_features]
# Target: the Churn column.
y = churn["Churn"]

### Split the data into training and testing

In [63]:
# 80/20 hold-out split. stratify=y keeps the class proportions of y in both
# parts; random_state pins the shuffle so the split is reproducible.
# NOTE(review): the recorded metrics further down report 400 training and 100
# test rows, which does not match a 7043-row frame at test_size=0.2 -- the
# saved outputs look stale; confirm with Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

### Random Forest

In [64]:
# Random forest with trees capped at depth 5. class_weight='balanced'
# reweights classes inversely to their frequency in the training labels;
# random_state fixes the forest's randomness for reproducibility.
rf = RandomForestClassifier(
    max_depth=5,
    class_weight='balanced',
    random_state=42,
)

In [65]:
# Fit the forest on the training split only; the test split stays unseen until evaluation.
rf.fit(X_train, y_train)

In [66]:
# Class predictions for both splits. Training-set predictions are kept so
# the train and test scores can be compared.
rf_pred_train, rf_pred_test = (rf.predict(features) for features in (X_train, X_test))

In [67]:
# Accuracy on each split, computed first and then reported.
train_accuracy = accuracy_score(y_train, rf_pred_train)
test_accuracy = accuracy_score(y_test, rf_pred_test)

print("training accuracy - ", train_accuracy)
print("testing accuracy - ", test_accuracy)

training accuracy -  0.8525
testing accuracy -  0.77


In [68]:
# Precision on each split (precision_score defaults: binary average, positive label 1).
train_precision = precision_score(y_train, rf_pred_train)
test_precision = precision_score(y_test, rf_pred_test)

print("training precision - ", train_precision)
print("testing precision - ", test_precision)

training precision -  0.6788990825688074
testing precision -  0.5416666666666666


In [69]:
# Per-class precision / recall / F1 for each split.
#
# The headings previously read "training precision" / "testing precision"
# (copied from the cell above) although the full classification report is
# printed. Each heading now goes on its own line, because printing it on the
# same line as the report pushed the report's header row out of alignment.
print("training classification report:")
print(classification_report(y_train, rf_pred_train))

print("testing classification report:")
print(classification_report(y_test, rf_pred_test))

training precision -                precision    recall  f1-score   support

           0       0.92      0.88      0.90       302
           1       0.68      0.76      0.71        98

    accuracy                           0.85       400
   macro avg       0.80      0.82      0.81       400
weighted avg       0.86      0.85      0.86       400

testing precision -                precision    recall  f1-score   support

           0       0.84      0.85      0.85        75
           1       0.54      0.52      0.53        25

    accuracy                           0.77       100
   macro avg       0.69      0.69      0.69       100
weighted avg       0.77      0.77      0.77       100



### Store the pkl file

In [70]:
# Persist the fitted forest to disk.
# joblib.dump accepts a path and manages the file itself, so no explicit
# open()/with block is needed. `joblib` is imported with the other imports
# at the top of the notebook.
# The trailing semicolon suppresses the returned filename list in the cell output.
joblib.dump(rf, "churn_model1.pkl");

### Load the pkl file

In [71]:
# Reload the saved model from disk to check that the file round-trips.
# joblib files are pickle-based: only load files from a trusted source.
rf_model = joblib.load("churn_model1.pkl")

### Predict the sample data

In [80]:
# Two hand-built single-row inputs. Values are positional and must follow
# the order of X_train.columns; categorical fields are given as integer codes.
# NOTE(review): the codes presumably match the label encoding applied to the
# training data -- confirm against the fitted encoders.
sample = [
    [0, 0, 1, 0, 2, 2, 0, 95.40, 293.15],
]
sample1 = [
    [1, 1, 0, 1, 2, 2, 1, 54.45, 2854.55],
]

# Only sample1 is wrapped in a frame for prediction; reusing the training
# column labels keeps the feature names consistent with the fitted model.
df = pd.DataFrame(sample1, columns=X_train.columns)

In [81]:
# Predict the class for the single sample row using the reloaded model.
rf_model.predict(df)

array([1])