<a href="https://colab.research.google.com/github/Sid-istic/End-to-End-customer-Churn/blob/main/optimizations/02_FeatureImportance_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Adding a new feature to dataset

In [98]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
import pickle

In [99]:
churn = pd.read_csv('/content/churn_processed (1).csv')

In [100]:
churn.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Tenure_Monthly
0,0,0,1,0,1,0,1,0,0,2,...,0,0,0,0,1,2,29.85,2505,0,29.85
1,1,0,0,0,34,1,0,0,2,0,...,0,0,0,1,0,3,56.95,1466,0,1936.3
2,1,0,0,0,2,1,0,0,2,2,...,0,0,0,0,1,3,53.85,157,1,107.7
3,1,0,0,0,45,0,1,0,2,0,...,2,0,0,1,0,0,42.3,1400,0,1903.5
4,0,0,0,0,2,1,0,1,0,0,...,0,0,0,0,1,2,70.7,925,1,141.4


In [101]:
numeric_colums = ['tenure', 'MonthlyCharges','Tenure_Monthly' , 'TotalCharges']
churn['TotalCharges'] = pd.to_numeric(churn['TotalCharges'], errors='coerce')
churn['TotalCharges'] = churn['TotalCharges'].fillna(0)

In [102]:
scaler = StandardScaler()
scale = {}

for col in numeric_colums:
  scale[col] = scaler.fit(churn[col].values.reshape(-1,1))
  churn[col] = scale[col].transform(churn[col].values.reshape(-1,1))

churn.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Tenure_Monthly
0,0,0,1,0,-1.277445,0,1,0,0,2,...,0,0,0,0,1,2,-1.160323,-0.398608,0,-0.993448
1,1,0,0,0,0.066327,1,0,0,2,0,...,0,0,0,1,0,3,-0.259629,-0.948762,0,-0.151588
2,1,0,0,0,-1.236724,1,0,0,2,2,...,0,0,0,0,1,3,-0.36266,-1.641883,1,-0.959071
3,1,0,0,0,0.514251,0,1,0,2,0,...,2,0,0,1,0,0,-0.746535,-0.98371,0,-0.166072
4,0,0,0,0,-1.236724,1,0,1,0,0,...,0,0,0,0,1,2,0.197365,-1.235224,1,-0.944189


In [103]:
with open('scale.pkl', 'wb') as f:
  pickle.dump(scale, f)

In [104]:
model = RandomForestClassifier(random_state=42)

In [105]:
X = churn.drop('Churn', axis=1)
Y = churn['Churn']

smote = SMOTE(random_state=42)
x, y = smote.fit_resample(X, Y)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [106]:
model.fit(X_train, y_train)

In [107]:
y_prediction = model.predict(X_test)
accuracy = accuracy_score(y_prediction, y_test)
print(accuracy)

0.8463768115942029


In [108]:
all_features = churn.drop('Churn', axis=1)
all_features.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Tenure_Monthly'],
      dtype='object')

In [109]:
feature_importance = pd.DataFrame({'Feature': all_features.columns, 'Importance': model.feature_importances_})
print(feature_importance.sort_values(by='Importance', ascending=False))

             Feature  Importance
14          Contract    0.135305
17    MonthlyCharges    0.129934
4             tenure    0.129535
19    Tenure_Monthly    0.119425
18      TotalCharges    0.110156
11       TechSupport    0.059197
8     OnlineSecurity    0.051974
16     PaymentMethod    0.042126
7    InternetService    0.025811
9       OnlineBackup    0.024235
3         Dependents    0.024016
10  DeviceProtection    0.022028
2            Partner    0.022019
0             gender    0.021301
15  PaperlessBilling    0.017706
6      MultipleLines    0.017549
12       StreamingTV    0.014360
13   StreamingMovies    0.014358
1      SeniorCitizen    0.013367
5       PhoneService    0.005597


The above table shows the most important features for the model to predict the output
meaning when taking custom input from user,we only want values of important features

In [110]:
def drop_features(df, features):
    return df.drop(features, axis=1)


useless_features = []
for feat in feature_importance['Feature']:
  if feature_importance.loc[feature_importance['Feature'] == feat, 'Importance'].values[0] < 0.025: # getting rid of features whose importance is less than 0.03
    useless_features.append(feat)

useless_features

['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'OnlineBackup',
 'DeviceProtection',
 'StreamingTV',
 'StreamingMovies',
 'PaperlessBilling']

In [111]:
for feat in useless_features:
  all_features = drop_features(all_features, [feat])
all_features['gender'] = churn['gender'] #new dataset with important features

all_features

Unnamed: 0,tenure,InternetService,OnlineSecurity,TechSupport,Contract,PaymentMethod,MonthlyCharges,TotalCharges,Tenure_Monthly,gender
0,-1.277445,0,0,0,0,2,-1.160323,-0.398608,-0.993448,0
1,0.066327,0,2,0,1,3,-0.259629,-0.948762,-0.151588,1
2,-1.236724,0,2,0,0,3,-0.362660,-1.641883,-0.959071,1
3,0.514251,0,2,2,1,0,-0.746535,-0.983710,-0.166072,1
4,-1.236724,1,0,0,0,2,0.197365,-1.235224,-0.944189,0
...,...,...,...,...,...,...,...,...,...,...
7038,-0.340876,0,2,2,1,3,0.665992,-0.879397,-0.107915,1
7039,1.613701,1,0,0,1,1,1.277533,1.292099,2.274525,0
7040,-0.870241,0,2,0,0,2,-1.168632,-0.139680,-0.862849,0
7041,-1.155283,1,0,0,0,3,0.320338,-0.316534,-0.875214,1


In [114]:
X_new ,Y_new = all_features, churn['Churn']
Y_new.value_counts()
smote = SMOTE(random_state=42)
X_new, Y_new = smote.fit_resample(X_new, Y_new)

X_train, X_test, y_train, y_test = train_test_split(X_new, Y_new, test_size=0.2, random_state=42)

In [115]:
model.fit(X_train, y_train)
y_new_prediction = model.predict(X_test)
accuracy = accuracy_score(y_new_prediction, y_test)
print(accuracy)

0.8285024154589372


In [116]:
all_features['Churn'] = churn['Churn']

In [117]:
all_features.to_csv('new_training_data.csv', index=False)