In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
df=pd.read_csv('data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,CustomerID,Name,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,0,1,Customer_1,63,1,2,17,73.36,236,0
1,1,2,Customer_2,62,0,4,1,48.76,172,0
2,2,3,Customer_3,24,0,2,5,85.47,460,0
3,3,4,Customer_4,36,0,3,3,97.94,297,1
4,4,5,Customer_5,46,0,3,19,58.14,266,0


In [3]:
df2=df.drop(['Unnamed: 0'],axis=1)

In [4]:
df2.head()

Unnamed: 0,CustomerID,Name,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,1,Customer_1,63,1,2,17,73.36,236,0
1,2,Customer_2,62,0,4,1,48.76,172,0
2,3,Customer_3,24,0,2,5,85.47,460,0
3,4,Customer_4,36,0,3,3,97.94,297,1
4,5,Customer_5,46,0,3,19,58.14,266,0


In [5]:
#Split data into features(X) and target(Y)
X = df2.drop(['CustomerID', 'Name', 'Churn'], axis=1)
y = df2['Churn']

In [6]:
X

Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB
0,63,1,2,17,73.36,236
1,62,0,4,1,48.76,172
2,24,0,2,5,85.47,460
3,36,0,3,3,97.94,297
4,46,0,3,19,58.14,266
...,...,...,...,...,...,...
99995,33,1,1,23,55.13,226
99996,62,0,4,19,61.65,351
99997,64,1,0,17,96.11,251
99998,51,0,4,20,49.25,434


In [7]:
y

0        0
1        0
2        0
3        1
4        0
        ..
99995    1
99996    0
99997    1
99998    1
99999    1
Name: Churn, Length: 100000, dtype: int64

In [8]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Feature Engineering

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [11]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

In [12]:
# Train the model
rf_model.fit(X_train_scaled, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [13]:
# Make predictions
y_pred = rf_model.predict(X_test_scaled)

In [14]:
rf_model.score(X_test_scaled,y_test)

0.49955

In [15]:
#Classification Report
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.50      0.53      0.52     10079
           1       0.50      0.47      0.48      9921

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.50      0.50      0.50     20000



In [16]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Define the hyperparameter grid
param_distributions = {
    'n_estimators': np.arange(100, 1001, 100),
    'max_depth': [None] + list(np.arange(10, 110, 10)),
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the random forest model
rf_model = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_distributions,
                                   n_iter=2, cv=5, scoring='f1_macro', random_state=42, n_jobs=-1)

# Perform the randomized search on your large dataset
random_search.fit(X_train_scaled, y_train)

# Get the best parameters and best estimator
best_params = random_search.best_params_
best_rf_model = random_search.best_estimator_

print("Best Parameters:", best_params)


Best Parameters: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 10}


In [17]:
random_search.cv_results_

{'mean_fit_time': array([ 43.60287318, 126.88409166]),
 'std_fit_time': array([1.11600721, 9.39547185]),
 'mean_score_time': array([1.14189591, 7.88678746]),
 'std_score_time': array([0.17287052, 3.42183944]),
 'param_n_estimators': masked_array(data=[300, 600],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[5, 5],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_leaf': masked_array(data=[1, 4],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[10, 40],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_estimators': 300,
   'min_samples_split': 5,
   'min_samples_leaf': 1,
   'max_depth': 10},
  {'n_estimators': 600,
   'min_samples_split': 5,
   'min_samples_leaf': 4,
   'max_depth': 40}],
 'split0_test_score': array

In [20]:
dt = pd.DataFrame(random_search.cv_results_)
dt

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,43.602873,1.116007,1.141896,0.172871,300,5,1,10,"{'n_estimators': 300, 'min_samples_split': 5, ...",0.505412,0.49653,0.495218,0.499746,0.50166,0.499713,0.00365,1
1,126.884092,9.395472,7.886787,3.421839,600,5,4,40,"{'n_estimators': 600, 'min_samples_split': 5, ...",0.501824,0.498155,0.487933,0.502071,0.501049,0.498206,0.005323,2


In [21]:
dt[['params','mean_test_score']]

Unnamed: 0,params,mean_test_score
0,"{'n_estimators': 300, 'min_samples_split': 5, ...",0.499713
1,"{'n_estimators': 600, 'min_samples_split': 5, ...",0.498206


In [22]:
from sklearn.linear_model import LogisticRegression

# Initialize the Logistic Regression model
logreg_model = LogisticRegression(random_state=42)

# Train the model
logreg_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred_logreg = logreg_model.predict(X_test_scaled)

print(classification_report(y_test, y_pred, labels=[0,1]))


              precision    recall  f1-score   support

           0       0.50      0.53      0.52     10079
           1       0.50      0.47      0.48      9921

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.50      0.50      0.50     20000



In [23]:
from sklearn.neural_network import MLPClassifier

# Initialize the Neural Network (MLP) model
mlp_model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)

# Train the model
mlp_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred_mlp = mlp_model.predict(X_test_scaled)

print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.50      0.53      0.52     10079
           1       0.50      0.47      0.48      9921

    accuracy                           0.50     20000
   macro avg       0.50      0.50      0.50     20000
weighted avg       0.50      0.50      0.50     20000



In [29]:
#pickiling the model

import joblib

joblib.dump(mlp_model, 'mlp_model.pkl')


['mlp_model.pkl']

In [30]:
load_model = joblib.load('mlp_model.pkl')

In [31]:
model_score = load_model.score(X_test_scaled, y_test)

In [32]:
model_score

0.49955