In [1]:
#import required libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the raw churn data from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="churn_data.csv")

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,customerID,tenure,PhoneService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,1,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,34,Yes,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,2,Yes,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,45,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,2,Yes,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


Before processing the dataset further, first check whether it contains any null values.

In [4]:
# Count missing values per column.
df.isna().sum()

customerID          0
tenure              0
PhoneService        0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [5]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

customerID           object
tenure                int64
PhoneService         object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

The `customerID` column is just an identifier and carries no information about the target (whether the customer has churned), so we drop it.

In [6]:
# Drop the identifier column. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df = df.drop(columns=['customerID'], errors='ignore')

The `Churn` column has the object data type, so for further processing we convert it to an integer data type.

In [7]:
# Encode the target as 0/1 with an explicit integer dtype.
# Series.replace() from strings to ints relies on silent downcasting of the
# object column, which newer pandas deprecates (FutureWarning); if the column
# stayed object, get_dummies() would later one-hot encode the target itself.
# The dtype guard makes re-running this cell a no-op instead of mapping the
# already-encoded 0/1 values to NaN.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1}).astype('int64')

In [8]:
# Class balance of the encoded target.
df.Churn.value_counts()

Churn
0    5173
1    1869
Name: count, dtype: int64

In [9]:
# Re-check the column dtypes after the preceding conversion steps.
df.dtypes

tenure                int64
PhoneService         object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                 int64
dtype: object

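`TotalCharges`, as the name suggests, should be numeric, but it was read as `object`. We convert it with `pd.to_numeric`; entries that cannot be parsed as numbers are coerced to `NaN`.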

In [10]:
# Parse TotalCharges as numbers; errors='coerce' turns unparseable entries into NaN.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric

In [11]:
# Encode the Yes/No columns as 0/1 with an explicit integer dtype.
# map() + astype() avoids the deprecated silent downcasting that
# Series.replace() relies on when turning an object column into ints, and the
# dtype guard keeps the cell safe to re-run on already-encoded columns.
for col in ('PhoneService', 'PaperlessBilling'):
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map({'No': 0, 'Yes': 1}).astype('int64')

Convert the remaining categorical columns into numerical indicator columns (one-hot encoding) for further processing.

In [27]:
# One-hot encode the remaining categorical columns, then show the resulting shape.
new_df1 = pd.get_dummies(data=df)
new_df1.shape

(7042, 13)

In [29]:
# Remove every row that has at least one missing value, then confirm none remain.
new_df1 = new_df1.dropna(axis=0, how='any')
new_df1.isna().sum()

tenure                                     0
PhoneService                               0
PaperlessBilling                           0
MonthlyCharges                             0
TotalCharges                               0
Churn                                      0
Contract_Month-to-month                    0
Contract_One year                          0
Contract_Two year                          0
PaymentMethod_Bank transfer (automatic)    0
PaymentMethod_Credit card (automatic)      0
PaymentMethod_Electronic check             0
PaymentMethod_Mailed check                 0
dtype: int64

# (rows, columns) of the encoded frame at this point in the notebook.
new_df1.shape

### Recursive Feature Elimination (RFE)

RFE is a feature selection technique that iteratively removes the least important features to find the most significant subset.

In [30]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

In [31]:
# Separate the target (y) from the predictor columns (X).
y = new_df1['Churn']
X = new_df1.drop(columns='Churn')

In Recursive Feature Elimination (RFE), the **estimator** is the machine learning model used to determine feature importance; the least important features are then removed recursively.

We use a Decision Tree Classifier as the estimator because it provides feature importances and works directly on both the numeric columns and the one-hot-encoded categorical columns without any feature scaling.

In [56]:
# Keep the 8 highest-ranked features according to a decision tree estimator.
# random_state is fixed because the tree permutes features randomly at each
# split; without a seed the selected subset can differ between runs, so the
# notebook would not reproduce under Restart & Run All.
rfe = RFE(
    estimator=DecisionTreeClassifier(random_state=42),
    n_features_to_select=8,
)
X_rfe_selected = rfe.fit_transform(X, y)

In [57]:
# Names of the columns RFE kept (rfe.support_ is a boolean mask over X.columns).
selected_rfe_features = X.columns[rfe.support_]
# Build a frame from the selected-feature array; it gets a fresh 0..n-1 index.
df_rfe_selected = pd.DataFrame(X_rfe_selected, columns=selected_rfe_features)
# Attach the target by POSITION, not by index label. y still carries the index
# of new_df1, which can have gaps after dropna(); assigning the Series directly
# would align on labels, pairing rows with the wrong target and inserting NaN.
# NOTE(review): the column name "Chrun" is a misspelling of "Churn"; it is kept
# here because later cells may reference it - rename everywhere in one change.
df_rfe_selected["Chrun"] = y.to_numpy()

In [58]:
# Preview the first five rows of the RFE-selected features plus target.
df_rfe_selected.head(n=5)

Unnamed: 0,tenure,PaperlessBilling,MonthlyCharges,TotalCharges,Contract_Month-to-month,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,Chrun
0,1.0,1.0,29.85,29.85,1.0,0.0,0.0,1.0,0.0
1,34.0,0.0,56.95,1889.5,0.0,0.0,0.0,0.0,0.0
2,2.0,1.0,53.85,108.15,1.0,0.0,0.0,0.0,1.0
3,45.0,0.0,42.3,1840.75,0.0,1.0,0.0,0.0,0.0
4,2.0,1.0,70.7,151.65,1.0,0.0,0.0,1.0,1.0
