<a href="https://colab.research.google.com/github/Sid-istic/End-to-End-customer-Churn/blob/main/optimizations/02_FeatureImportance_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Adding a new feature to the dataset

In [41]:
import pandas as pd
import numpy as np
import shap
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
import pickle

In [42]:
# Load the churn dataset; the path is relative to the notebook's working directory.
churn = pd.read_csv('churn_processed.csv')

In [43]:
# Preview the first five rows to sanity-check the loaded columns and values.
churn.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,NumServices
0,1.0,0,0.0,1.0,1,1.0,0.0,1.0,1.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,29.85,0.0,1.0,1
1,0.0,0,1.0,1.0,34,0.0,1.0,1.0,0.0,1.0,...,1.0,1.0,1.0,0.0,1.0,0.0,56.95,0.0,1.0,3
2,0.0,0,1.0,1.0,2,0.0,1.0,1.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,53.85,0.0,0.0,3
3,0.0,0,1.0,1.0,45,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,42.3,0.0,1.0,3
4,1.0,0,1.0,1.0,2,0.0,1.0,0.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,0.0,70.7,0.0,0.0,1


In [44]:
# Random forest with library-default hyperparameters; the fixed seed keeps runs repeatable.
model = RandomForestClassifier(random_state=42)

In [45]:
# Separate the predictors from the 'Churn' target column.
X = churn.drop(columns='Churn')
y = churn['Churn']

# Hold out 20% of the rows for evaluation; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# SMOTE oversampling is applied to the training split only; the test split
# is left exactly as it came out of train_test_split.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [46]:
# Train on the SMOTE-resampled training split produced in the previous cell.
model.fit(X_train, y_train)

In [47]:
# Score the baseline model on the held-out test split (which was not resampled).
y_prediction = model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens to
# be symmetric, so the value is unchanged, but keeping the convention avoids a
# silent error if this is ever swapped for an asymmetric metric such as
# precision or recall.
accuracy = accuracy_score(y_test, y_prediction)
print(accuracy)

0.7778566359119943


In [48]:
# Candidate feature matrix: every column of the dataset except the target.
all_features = churn.drop(columns='Churn')
all_features.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'NumServices'],
      dtype='object')

In [49]:

# Pair each feature column with the importance score the fitted forest gave it.
feature_importance = pd.DataFrame(
    {
        'Feature': all_features.columns,
        'Importance': model.feature_importances_,
    }
)
# Show the ranking with the most important feature first.
print(feature_importance.sort_values('Importance', ascending=False))


             Feature  Importance
17    MonthlyCharges    0.166936
4             tenure    0.157832
14          Contract    0.116488
11       TechSupport    0.087631
8     OnlineSecurity    0.074908
15  PaperlessBilling    0.055421
9       OnlineBackup    0.046343
2            Partner    0.037218
3         Dependents    0.036841
0             gender    0.036709
10  DeviceProtection    0.036562
19       NumServices    0.029423
16     PaymentMethod    0.028343
6      MultipleLines    0.022059
1      SeniorCitizen    0.016138
12       StreamingTV    0.015384
13   StreamingMovies    0.015201
7    InternetService    0.015198
5       PhoneService    0.005191
18      TotalCharges    0.000175


The table above ranks the features by how much each one contributes to the model's predictions.
This means that when taking custom input from a user, we only need values for the important features.

In [50]:
def drop_features(df, features):
    """Return a new DataFrame without the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to remove columns from; it is not modified.
    features : str or list of str
        Column label(s) to remove.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` lacking the columns named in ``features``.

    Raises
    ------
    KeyError
        If any label in ``features`` is not a column of ``df``.
    """
    return df.drop(columns=features)


# Features whose importance falls below this cut-off are treated as carrying
# no usable signal and become candidates for removal.
IMPORTANCE_THRESHOLD = 0.001

# A single boolean mask selects every low-importance row at once, instead of
# re-scanning the whole frame once per feature. Row order is preserved, so the
# resulting list is in the same order as the rows of `feature_importance`.
useless_features = feature_importance.loc[
    feature_importance['Importance'] < IMPORTANCE_THRESHOLD, 'Feature'
].tolist()

useless_features

['TotalCharges']

In [51]:
# Remove all low-importance columns in one call. Filtering against the current
# columns keeps this cell safe to re-run: on a second execution the columns are
# already gone, and dropping them again would raise KeyError.
all_features = drop_features(
    all_features,
    [feat for feat in useless_features if feat in all_features.columns],
)
all_features  # feature matrix with the low-importance columns removed

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,NumServices
0,1.0,0,0.0,1.0,1,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,29.85,1
1,0.0,0,1.0,1.0,34,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,56.95,3
2,0.0,0,1.0,1.0,2,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,53.85,3
3,0.0,0,1.0,1.0,45,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,42.30,3
4,1.0,0,1.0,1.0,2,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,70.70,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0.0,0,0.0,0.0,24,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,84.80,6
7039,1.0,0,0.0,0.0,72,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,103.20,5
7040,1.0,0,0.0,0.0,11,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,29.60,1
7041,0.0,1,0.0,1.0,4,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,74.40,1


In [52]:
# Split the reduced feature matrix using the same test_size and random_state
# as the first split.
X_new_train, X_new_test, y_new_train, y_new_test = train_test_split(
    all_features,
    y,
    test_size=0.2,
    random_state=42,
)

# SMOTE oversampling is applied to the new training split only; the new test
# split is left as produced by train_test_split.
smote_new = SMOTE(random_state=42)
X_new_train, y_new_train = smote_new.fit_resample(X_new_train, y_new_train)

In [53]:
# Refit the same estimator object on the reduced feature set; from here on
# `model` refers to the model trained without the dropped columns.
model.fit(X_new_train, y_new_train)
y_new_prediction = model.predict(X_new_test)
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, so the value is unchanged, but the conventional order guards
# against mistakes if an asymmetric metric is substituted later.
# Note: this overwrites the `accuracy` value computed for the baseline model.
accuracy = accuracy_score(y_new_test, y_new_prediction)
print(accuracy)

0.7849538679914834


In [54]:
# Serialise the most recently fitted `model` to disk in the working directory.
# The context manager closes the file handle even if pickling fails.
with open("optimised_model.pkl", mode="wb") as model_file:
    pickle.dump(model, model_file)