In [9]:
# Importing Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from imblearn.over_sampling import SMOTE

In [10]:
# Reading the Dataset
raw_data = pd.read_csv('dataset.csv')
raw_data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [11]:
# Display the Dataset
print(raw_data)

      customerID  gender  SeniorCitizen Partner Dependents  tenure  \
0     7590-VHVEG  Female              0     Yes         No       1   
1     5575-GNVDE    Male              0      No         No      34   
2     3668-QPYBK    Male              0      No         No       2   
3     7795-CFOCW    Male              0      No         No      45   
4     9237-HQITU  Female              0      No         No       2   
...          ...     ...            ...     ...        ...     ...   
7038  6840-RESVB    Male              0     Yes        Yes      24   
7039  2234-XADUH  Female              0     Yes        Yes      72   
7040  4801-JZAZL  Female              0     Yes        Yes      11   
7041  8361-LTMKD    Male              1     Yes         No       4   
7042  3186-AJIEK    Male              0      No         No      66   

     PhoneService     MultipleLines InternetService OnlineSecurity  ...  \
0              No  No phone service             DSL             No  ...   
1        

In [12]:
# Feature Selection
non_numeric_cols = raw_data.select_dtypes(include=['object']).columns.tolist()
raw_data_numeric = raw_data.drop(non_numeric_cols, axis=1)

In [13]:
# Impute Missing Values
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(raw_data_numeric)
raw_data_numeric_imputed = pd.DataFrame(imp.fit_transform(raw_data_numeric), columns=raw_data_numeric.columns)

In [14]:
# Concatenation
raw_data_imputed = pd.concat([raw_data[non_numeric_cols], raw_data_numeric_imputed], axis=1)
raw_data_imputed.columns
raw_data_imputed['Churn'].dtype

dtype('O')

In [15]:
# Label Encoding
le = LabelEncoder()
raw_data_imputed['Churn'] = le.fit_transform(raw_data_imputed['Churn'].astype(str))

In [16]:
# Create a DataFrame
df = pd.DataFrame(raw_data_imputed)

In [17]:
# Convert String columns to Binary Float columns
for col in df.select_dtypes(include="object"):
    df[col] = df[col].astype("category").cat.codes.astype("float")


In [18]:
# Display Binary Dataset
print(df)

      customerID  gender  Partner  Dependents  PhoneService  MultipleLines  \
0         5375.0     0.0      1.0         0.0           0.0            1.0   
1         3962.0     1.0      0.0         0.0           1.0            0.0   
2         2564.0     1.0      0.0         0.0           1.0            0.0   
3         5535.0     1.0      0.0         0.0           0.0            1.0   
4         6511.0     0.0      0.0         0.0           1.0            0.0   
...          ...     ...      ...         ...           ...            ...   
7038      4853.0     1.0      1.0         1.0           1.0            2.0   
7039      1525.0     0.0      1.0         1.0           1.0            2.0   
7040      3367.0     0.0      1.0         1.0           0.0            1.0   
7041      5934.0     1.0      1.0         0.0           1.0            2.0   
7042      2226.0     1.0      0.0         0.0           1.0            0.0   

      InternetService  OnlineSecurity  OnlineBackup  DeviceProt

In [19]:
# Standardization
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)

In [20]:
# Clustering Algorithm
kmeans = KMeans(n_clusters=3, random_state=0)
clusters = kmeans.fit_predict(X_scaled)

In [21]:
# Add Cluster labels to the Dataset
processed_data = df.copy()
processed_data['cluster'] = clusters
non_numeric_cols = raw_data.select_dtypes(include='object').columns

In [22]:
# Splitting Features and Target Variable
X = processed_data.drop(['Churn', 'cluster'], axis=1)
y = processed_data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [23]:
# Scaling Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [24]:
# Classification Algorithms
lr = LogisticRegression(max_iter=1000)
svm = SVC(kernel='linear', probability=True, random_state=0)
rf = RandomForestClassifier(n_estimators=100, random_state=0)
bagging = BaggingClassifier(RandomForestClassifier(n_estimators=100, random_state=0),n_estimators=10, random_state=0)
stacking = RandomForestClassifier(n_estimators=100, random_state=0).set_params(n_jobs=-1)

In [25]:
# SMOTE oversampling
sm = SMOTE(random_state=0)
X_train_smote, y_train_smote = sm.fit_resample(X_train_scaled, y_train)

In [26]:
# Train and Evaluate models
models = [('Logistic Regression', lr),
          ('Support Vector Machine', svm),
          ('Random Forest', rf),
          ('Bagging Classifier', bagging),
          ('Stacking Classifier', stacking)]


In [27]:
# Performance Metrics
results = []
for name, model in models:
    model.fit(X_train_smote, y_train_smote)
    y_pred = model.predict(X_test_scaled)
    result = {
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred)
    }
    results.append(result)
    print(result)

{'Model': 'Logistic Regression', 'Accuracy': 0.7423704755145494, 'Recall': 0.779891304347826, 'Precision': 0.5043936731107206, 'F1-Score': 0.6125933831376734}
{'Model': 'Support Vector Machine', 'Accuracy': 0.7331440738112136, 'Recall': 0.7961956521739131, 'Precision': 0.49326599326599324, 'F1-Score': 0.6091476091476091}
{'Model': 'Random Forest', 'Accuracy': 0.6323633782824698, 'Recall': 0.8342391304347826, 'Precision': 0.4018324607329843, 'F1-Score': 0.5424028268551236}
{'Model': 'Bagging Classifier', 'Accuracy': 0.6770759403832506, 'Recall': 0.8125, 'Precision': 0.4364963503649635, 'F1-Score': 0.5679012345679012}
{'Model': 'Stacking Classifier', 'Accuracy': 0.6323633782824698, 'Recall': 0.8342391304347826, 'Precision': 0.4018324607329843, 'F1-Score': 0.5424028268551236}


In [28]:
# Finding Best Model
best_model = max(results, key=lambda x: x["F1-Score"])
best_model_name = best_model["Model"]


In [29]:
# Results of Churn Predictions
total = len(processed_data)
churn = len(processed_data[processed_data['Churn'] == 1])
non_churn = len(processed_data[processed_data['Churn'] == 0])

print(f'Total customers: {total}')
print(f'Churn Customers: {churn}')
print(f'Non-Churn Customers: {non_churn}')

Total customers: 7043
Churn Customers: 1869
Non-Churn Customers: 5174
