<a href="https://colab.research.google.com/github/Sid-istic/End-to-End-customer-Churn/blob/main/optimizations/02_FeatureImportance_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Adding a new feature to the dataset

In [116]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
import pickle

In [117]:
# Load the pre-processed churn table from the notebook's working directory.
churn = pd.read_csv(filepath_or_buffer='churn_processed.csv')

In [118]:
# Preview the first five rows to sanity-check the loaded columns.
churn.head(n=5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Tenure_Monthly
0,1.0,0,0.0,1.0,1,1.0,0.0,1.0,1.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,29.85,0.0,1.0,29.85
1,0.0,0,1.0,1.0,34,0.0,1.0,1.0,0.0,1.0,...,1.0,1.0,1.0,0.0,1.0,0.0,56.95,0.0,1.0,1936.3
2,0.0,0,1.0,1.0,2,0.0,1.0,1.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,53.85,0.0,0.0,107.7
3,0.0,0,1.0,1.0,45,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,42.3,0.0,1.0,1903.5
4,1.0,0,1.0,1.0,2,0.0,1.0,0.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,0.0,70.7,0.0,0.0,141.4


In [119]:

# Standardize the engineered Tenure_Monthly column (zero mean, unit variance).
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split below, so test-set statistics leak into the transform. Consider fitting
# on the training partition only. The fitted scaler is also not persisted here,
# so any later inference on custom input would need the same scaling - confirm.
scaler = StandardScaler()
tenure_monthly_raw = churn['Tenure_Monthly'].to_numpy().reshape(-1, 1)
# fit_transform returns an (n, 1) array; flatten it back into a single column.
churn['Tenure_Monthly'] = scaler.fit_transform(tenure_monthly_raw).ravel()

In [120]:
# Random forest with a fixed seed so repeated runs give the same trees.
model = RandomForestClassifier(
    random_state=42,
)

In [121]:
# Separate the target from the predictors.
y = churn['Churn']
X = churn.drop(columns='Churn')

# Hold out 20% of the rows for evaluation (seeded for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Oversample only the training partition; the test partition is left untouched.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [122]:
# Train on the SMOTE-resampled training data using all features.
model.fit(X=X_train, y=y_train)

In [123]:
# Score the all-features model on the held-out test partition.
y_prediction = model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, so the value is unchanged, but the correct order matters as soon
# as this call is swapped for precision/recall or classification_report.
accuracy = accuracy_score(y_test, y_prediction)
print(accuracy)

0.7913413768630234


In [124]:
# Every predictor column, i.e. the churn table without the target.
all_features = churn.drop(columns=['Churn'])
all_features.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Tenure_Monthly'],
      dtype='object')

In [125]:

# Pair each predictor name with the importance the fitted forest assigned to it.
feature_importance = pd.DataFrame(
    {
        'Feature': all_features.columns,
        'Importance': model.feature_importances_,
    }
)
# Show the ranking, most important feature first.
print(feature_importance.sort_values('Importance', ascending=False))


             Feature  Importance
19    Tenure_Monthly    0.131961
17    MonthlyCharges    0.128682
4             tenure    0.115093
14          Contract    0.113493
11       TechSupport    0.085785
8     OnlineSecurity    0.078793
15  PaperlessBilling    0.053780
9       OnlineBackup    0.047375
10  DeviceProtection    0.036585
3         Dependents    0.034849
2            Partner    0.033241
0             gender    0.031151
16     PaymentMethod    0.025669
6      MultipleLines    0.019385
13   StreamingMovies    0.015858
7    InternetService    0.014641
12       StreamingTV    0.014627
1      SeniorCitizen    0.013246
5       PhoneService    0.005559
18      TotalCharges    0.000228


The table above ranks the features by how much each one contributes to the model's predictions.
This means that when taking custom input from a user, we only need values for the important features.

In [126]:
def drop_features(df, features):
    """Return a copy of ``df`` without the columns named in ``features``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to remove columns from; it is not modified.
    features : label or list of labels
        Column name(s) to drop.

    Returns
    -------
    pandas.DataFrame
        New frame containing the remaining columns in their original order.

    Raises
    ------
    KeyError
        If any requested column is not present in ``df``.
    """
    return df.drop(columns=features)


# Features whose importance is below this cut-off are treated as uninformative.
IMPORTANCE_THRESHOLD = 0.03

# Select the low-importance features with a single boolean mask instead of
# re-filtering the whole frame once per feature; row order is preserved.
low_importance_mask = feature_importance['Importance'] < IMPORTANCE_THRESHOLD
useless_features = feature_importance.loc[low_importance_mask, 'Feature'].tolist()

useless_features

['SeniorCitizen',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'StreamingTV',
 'StreamingMovies',
 'PaymentMethod',
 'TotalCharges']

In [127]:
# Build the reduced feature matrix with a single drop, starting again from
# `churn` so the cell can be re-run safely (dropping from the already-reduced
# `all_features` would raise KeyError on a second run) and without copying the
# frame once per removed column.
all_features = drop_features(churn.drop('Churn', axis=1), useless_features)
all_features #new dataset with important features

Unnamed: 0,gender,Partner,Dependents,tenure,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,Contract,PaperlessBilling,MonthlyCharges,Tenure_Monthly
0,1.0,0.0,1.0,1,1.0,0.0,1.0,1.0,1.0,0.0,29.85,-0.993448
1,0.0,1.0,1.0,34,0.0,1.0,0.0,1.0,0.0,1.0,56.95,-0.151588
2,0.0,1.0,1.0,2,0.0,0.0,1.0,1.0,1.0,0.0,53.85,-0.959071
3,0.0,1.0,1.0,45,0.0,1.0,0.0,0.0,0.0,1.0,42.30,-0.166072
4,1.0,1.0,1.0,2,1.0,1.0,1.0,1.0,1.0,0.0,70.70,-0.944189
...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0.0,0.0,0.0,24,0.0,1.0,0.0,0.0,0.0,0.0,84.80,-0.107915
7039,1.0,0.0,0.0,72,1.0,0.0,0.0,1.0,0.0,0.0,103.20,2.274525
7040,1.0,0.0,0.0,11,0.0,1.0,1.0,1.0,1.0,0.0,29.60,-0.862849
7041,0.0,0.0,1.0,4,1.0,1.0,1.0,1.0,1.0,0.0,74.40,-0.875214


In [128]:
# Same 80/20 seeded split as before, now on the reduced feature set.
# This relies on `all_features` not yet containing the 'Churn' column, which a
# later cell adds.
X_new_train, X_new_test, y_new_train, y_new_test = train_test_split(
    all_features,
    y,
    test_size=0.2,
    random_state=42,
)

# Rebalance the classes in the training partition only.
smote_new = SMOTE(random_state=42)
X_new_train, y_new_train = smote_new.fit_resample(X_new_train, y_new_train)

In [129]:
# Refit the same estimator object on the reduced feature set. After this cell,
# `model` (and its feature_importances_) refers to the reduced-feature model.
model.fit(X_new_train, y_new_train)
y_new_prediction = model.predict(X_new_test)
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, so the value is unchanged, but the correct order matters for
# asymmetric metrics such as precision and recall.
accuracy = accuracy_score(y_new_test, y_new_prediction)
print(accuracy)

0.7778566359119943


In [130]:
# Re-attach the target as the last column so the reduced dataset can be saved
# together with its labels.
all_features = all_features.assign(Churn=churn['Churn'])

In [131]:
# Write the reduced dataset (selected features plus 'Churn') without the index.
all_features.to_csv(path_or_buf='new_training_data.csv', index=False)