In [61]:
import pandas as pd 
import numpy as np
import sklearn 

In [62]:
data=pd.read_csv('../Dataset/combined_data.csv')
df=data.copy()

In [63]:
num_features_to_use=['number_of_dependents', 'total_charges', 'total_long_distance_charges', 'total_revenue', 'tenure', 'number_of_referrals']
dependent_cat_features=['under_30', 'senior_citizen', 'married', 'city', 'zip_code', 'internet_service', 'online_security', 'online_backup', 'device_protection', 'premium_tech_support', 'streaming_tv', 'streaming_movies', 'streaming_music', 'internet_type', 'contract', 'paperless_billing', 'payment_method', 'internet_service.1', 'multiple_lines', 'unlimited_data', 'offer', 'referred_a_friend', 'customer_status', 'churn_reason']
target=['churn_value']
print("No of numerical features:",len(num_features_to_use))
print("No of categorical features:",len(dependent_cat_features))
print("Target feature:",target[0])

No of numerical features: 6
No of categorical features: 24
Target feature: churn_value


Out of those categorical features, we will use contract and city as these two features had the highest feature importance followed by churn_reason.

# OUTLIER REMOVAL

total_revenue,total_long_distance_charges has outliers

In [64]:
contains_outliers=['total_revenue','total_long_distance_charges']
for i in contains_outliers:
    max_limit=df[i].mean()+3*df[i].std()
    min_limit=df[i].mean()-3*df[i].std()
    df=df[(df[i]<=max_limit) & (df[i]>=min_limit)]

In [65]:
df=df.reset_index()

# BALANCING THE DATASET 

In [68]:
cat_feat=['contract','city']
def cols_to_drop():
    return [cols for cols in df.columns if cols not in cat_feat+num_features_to_use+target+['customer_status']]
df=df.drop(columns=cols_to_drop())

In [69]:
x=df.iloc[:,0:8]
y=df['customer_status']

In [72]:
from imblearn.over_sampling import SMOTEN
sampler=SMOTEN(sampling_strategy={'Churned':5174,"Stayed":5174,"Joined":5174})
x_resampled,y_resampled=sampler.fit_resample(x,y)

In [73]:
df=pd.concat([x_resampled,y_resampled],axis=1)

In [75]:
print("No of entries with class as Churned:",df[df['customer_status']=='Churned'].shape)
print("No of entries with class as Not Churned:",df[(df['customer_status']=='Joined') | (df['customer_status']=='Stayed')].shape)

No of entries with class as Churned: (5174, 9)
No of entries with class as Not Churned: (10348, 9)


As 5174 samples for Churned,5174 samples for Joined and 5174 samples for Stayed.

# ENCODING

Can apply One hot encoding directly onto all the columns combined but while applying Label encoder, we will apply it to each feature separetely

In [76]:
from sklearn.preprocessing import LabelEncoder
import pickle

In [77]:
label_encoders={}
for i in cat_feat:
    if i=='city':
        df[i]=df[i].str.lower()
    le=LabelEncoder()
    df[i]=le.fit_transform(df[i])
    label_encoders[i]=le

In [78]:
pickle.dump(label_encoders,open('../Models/label_encoders.sav','wb'))

# SCALING

In [79]:
from sklearn.preprocessing import StandardScaler 
scaler=StandardScaler()
df[num_features_to_use]=scaler.fit_transform(df[num_features_to_use])

pickle.dump(scaler,open('../Models/scaler.sav','wb'))

# SPEARMAN RANK CORRELATION

Checking once again for numerical features after removal of outliers and scaling

In [80]:
df_num_features=pd.concat([df[num_features_to_use],data['churn_value']],axis=1)

In [81]:
df_num_features

Unnamed: 0,number_of_dependents,total_charges,total_long_distance_charges,total_revenue,tenure,number_of_referrals,churn_value
0,-0.382357,-0.343653,0.028284,-0.279578,-0.441044,0.336161,0.0
1,-0.382357,-0.370821,-0.412370,-0.432433,-0.441044,-0.476151,0.0
2,-0.382357,-0.510427,-0.353076,-0.514129,-0.659444,-0.476151,1.0
3,-0.382357,0.000385,-0.002374,-0.017630,-0.266324,-0.069995,1.0
4,-0.382357,-0.517606,-0.526774,-0.566925,-0.703124,0.742317,1.0
...,...,...,...,...,...,...,...
15517,-0.382357,-0.058916,2.043913,0.557400,1.087757,-0.476151,
15518,-0.382357,-0.536448,-0.346928,-0.338623,-0.266324,-0.476151,
15519,3.419656,0.075433,0.678208,0.226096,2.267119,3.179253,
15520,-0.382357,-0.280748,-0.324239,-0.231944,-0.266324,-0.069995,


In [82]:
correlation=df_num_features.corr()

correlation['churn_value']

number_of_dependents          -0.000679
total_charges                 -0.010025
total_long_distance_charges   -0.003884
total_revenue                 -0.009067
tenure                        -0.017707
number_of_referrals           -0.023341
churn_value                    1.000000
Name: churn_value, dtype: float64

In [83]:
df.to_csv('../Dataset/preprocessed_dataset.csv',index=False)