In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import pickle

In [2]:
df = pd.read_csv("telecom_churn.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56.95,1889.5,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53.85,108.15,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0
3,3,0,42.3,1840.75,0,0,1,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,4,0,70.7,151.65,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,0


In [4]:
df.drop(columns="Unnamed: 0", axis=1, inplace=True)

In [5]:
target = "Churn"
X = df.drop(columns=target)
y = df[target]

In [6]:
X

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.50,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.30,1840.75,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70.70,151.65,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84.80,1990.50,0,1,0,1,0,1,0,...,0,0,0,1,0,1,0,0,0,0
7028,0,103.20,7362.90,1,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
7029,0,29.60,346.45,1,0,0,1,0,1,1,...,0,0,1,0,1,0,0,0,0,0
7030,1,74.40,306.60,0,1,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0,0


In [7]:
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

# Decision Tree Classifier

In [8]:
sm = SMOTEENN()
X_resam, y_resam = sm.fit_resample(X, y)

In [9]:
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_resam, y_resam, test_size=0.2, random_state=42)

In [10]:
print(X.shape)
print(y.shape)
print(X_resam.shape)
print(y_resam.shape)

(7032, 50)
(7032,)
(5931, 50)
(5931,)


In [11]:
max_depth = range(5, 31, 5)
max_features = ["auto", "sqrt"]
min_samples_split=[2, 5, 10, 25, 50, 100]
min_samples_leaf = [1, 2, 5, 10]

In [12]:
random_grid_dtc = {"max_depth": max_depth,
               "max_features": max_features,
               "min_samples_split": min_samples_split,
               "min_samples_leaf": min_samples_leaf
              }

In [13]:
reg_dtc=DecisionTreeClassifier()

In [14]:
model_dtc = RandomizedSearchCV(
    estimator=reg_dtc, 
    param_distributions=random_grid_dtc,
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

In [15]:
model_dtc.fit(Xr_train, yr_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits




In [16]:
y_pred = model_dtc.predict(Xr_test)
y_pred

array([0, 0, 1, ..., 1, 0, 1], dtype=int64)

In [17]:
model_dtc.score(Xr_test, yr_test)

0.9039595619208087

In [18]:
print(classification_report(yr_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.90      0.89      0.89       545
           1       0.91      0.92      0.91       642

    accuracy                           0.90      1187
   macro avg       0.90      0.90      0.90      1187
weighted avg       0.90      0.90      0.90      1187



# Random Forest Classifier

In [19]:
n_estimators = range(100, 1201, 100)
max_depth = range(5, 31, 5)
max_features = ["auto", "sqrt"]
min_samples_split=[2, 5, 10, 25, 50, 100]
min_samples_leaf = [1, 2, 5, 10]


In [20]:
random_grid_rfc = {"n_estimators": n_estimators,
               "max_depth": max_depth,
               "max_features": max_features,
               "min_samples_split": min_samples_split,
               "min_samples_leaf": min_samples_leaf
              }

In [21]:
reg_rfc = RandomForestClassifier()

In [22]:
model_rfc = RandomizedSearchCV(
    estimator=reg_rfc, 
    param_distributions=random_grid_rfc,
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

In [23]:
model_rfc.fit(Xr_train, yr_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [24]:
y_rfc_pred = model_rfc.predict(Xr_test)

In [25]:
y_rfc_pred

array([0, 0, 1, ..., 1, 0, 1], dtype=int64)

In [26]:
model_rfc.score(Xr_test, yr_test)

0.9561920808761584

In [27]:
print(classification_report(yr_test, y_rfc_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95       545
           1       0.95      0.97      0.96       642

    accuracy                           0.96      1187
   macro avg       0.96      0.96      0.96      1187
weighted avg       0.96      0.96      0.96      1187



In [28]:
filename = "model.pkl"

In [29]:
pickle.dump(model_rfc, open(filename, "wb"))