In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import warnings

In [2]:
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv("churn.csv")

In [19]:
print(df.head())

   gender  SeniorCitizen  Partner  Dependents  tenure  PhoneService  \
0       0              0        1           0       1             0   
1       1              0        0           0      34             1   
2       1              0        0           0       2             1   
3       1              0        0           0      45             0   
4       0              0        0           0       2             1   

   MultipleLines  InternetService  OnlineSecurity  OnlineBackup  ...  \
0              0                1               0             1  ...   
1              0                1               1             0  ...   
2              0                1               1             1  ...   
3              0                1               1             0  ...   
4              0                2               0             0  ...   

   StreamingMovies  PaperlessBilling  MonthlyCharges  TotalCharges  Churn  \
0                0                 1              29           

In [18]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 23 columns):
 #   Column                                 Non-Null Count  Dtype
---  ------                                 --------------  -----
 0   gender                                 7032 non-null   int32
 1   SeniorCitizen                          7032 non-null   int32
 2   Partner                                7032 non-null   int32
 3   Dependents                             7032 non-null   int32
 4   tenure                                 7032 non-null   int32
 5   PhoneService                           7032 non-null   int32
 6   MultipleLines                          7032 non-null   int32
 7   InternetService                        7032 non-null   int32
 8   OnlineSecurity                         7032 non-null   int32
 9   OnlineBackup                           7032 non-null   int32
 10  DeviceProtection                       7032 non-null   int32
 11  TechSupport                        

In [6]:
print(df["OnlineBackup"].value_counts())

OnlineBackup
No                     3088
Yes                    2429
No internet service    1526
Name: count, dtype: int64


In [7]:
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df["MonthlyCharges"] = pd.to_numeric(df["MonthlyCharges"], errors="coerce")

In [8]:
df = df.dropna()

In [9]:
df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})
df["Partner"] = df["Partner"].map({"Yes": 1, "No": 0})
df["gender"] = df["gender"].map({"Male": 1, "Female": 0})
df["Dependents"] = df["Dependents"].map({"Yes": 1, "No": 0})
df["PhoneService"] = df["PhoneService"].map({"Yes": 1, "No": 0})
df["MultipleLines"] = df["MultipleLines"].map({"Yes": 1, "No": 0,"No phone service":0})
df["InternetService"] = df["InternetService"].map({"Fiber optic":2,"DSL": 1, "No": 0})
df["OnlineSecurity"] = df["OnlineSecurity"].map({"Yes": 1, "No": 0,"No internet service":0})
df["DeviceProtection"] = df["DeviceProtection"].map({"Yes": 1, "No": 0,"No internet service":0})
df["TechSupport"] = df["TechSupport"].map({"Yes": 1, "No": 0,"No internet service":0})
df["StreamingTV"] = df["StreamingTV"].map({"Yes": 1, "No": 0,"No internet service":0})
df["StreamingMovies"] = df["StreamingMovies"].map({"Yes": 1, "No": 0,"No internet service":0})

df["OnlineBackup"] = df["OnlineBackup"].map({"Yes": 1, "No": 0,"No internet service":0})
df["PaperlessBilling"] = df["PaperlessBilling"].map({"Yes": 1, "No": 0})

In [10]:
df = df.drop(columns=["customerID"])

In [11]:
cat_cols = df.select_dtypes(include=["object"]).columns

In [12]:
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)
df = df.astype(int)

In [13]:
X = df.drop("Churn", axis=1)
y = df["Churn"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=4,
    min_samples_leaf=2,
    max_features='sqrt',
    class_weight='balanced',
    random_state=42
)
# model = RandomForestClassifier(class_weight="balanced")
from imblearn.over_sampling import SMOTE

smote = SMOTE()
X_res, y_res = smote.fit_resample(X_train, y_train)
model.fit(X_res, y_res)

model.fit(X_res, y_res)

preds = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))


Accuracy: 0.7562189054726368
              precision    recall  f1-score   support

           0       0.85      0.81      0.83      1033
           1       0.54      0.61      0.57       374

    accuracy                           0.76      1407
   macro avg       0.69      0.71      0.70      1407
weighted avg       0.77      0.76      0.76      1407



In [24]:
from xgboost import XGBClassifier

model = XGBClassifier(
    n_estimators=550,
    max_depth=3,
    learning_rate=0.3,
    subsample=0.7,
    colsample_bytree=0.7,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
model.fit(X_res, y_res)
preds = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))

Accuracy: 0.7512437810945274
              precision    recall  f1-score   support

           0       0.84      0.82      0.83      1033
           1       0.53      0.57      0.55       374

    accuracy                           0.75      1407
   macro avg       0.69      0.69      0.69      1407
weighted avg       0.76      0.75      0.75      1407



from imblearn.over_sampling import SMOTE

smote = SMOTE()
X_res, y_res = smote.fit_resample(X_train, y_train)
model.fit(X_res, y_res)


In [16]:
sample = X_test.iloc[0]
prediction = model.predict([sample])
print("Churn" if prediction[0] == 1 else "Not Churn")

Not Churn
