In [61]:
import pandas as pd 
import numpy as np
import sklearn 

In [62]:
data=pd.read_csv('../Dataset/combined_data.csv')
df=data.copy()

In [63]:
num_features_to_use=['number_of_dependents', 'total_charges', 'total_long_distance_charges', 'total_revenue', 'tenure', 'number_of_referrals']
dependent_cat_features=['under_30', 'senior_citizen', 'married', 'city', 'zip_code', 'internet_service', 'online_security', 'online_backup', 'device_protection', 'premium_tech_support', 'streaming_tv', 'streaming_movies', 'streaming_music', 'internet_type', 'contract', 'paperless_billing', 'payment_method', 'internet_service.1', 'multiple_lines', 'unlimited_data', 'offer', 'referred_a_friend', 'customer_status', 'churn_reason']
target=['churn_value']
print("No of numerical features:",len(num_features_to_use))
print("No of categorical features:",len(dependent_cat_features))
print("Target feature:",target[0])

No of numerical features: 6
No of categorical features: 24
Target feature: churn_value


Out of those categorical features, we will use contract and city as these two features had the highest feature importance followed by churn_reason.

# OUTLIER REMOVAL

total_revenue,total_long_distance_charges has outliers

In [64]:
contains_outliers=['total_revenue','total_long_distance_charges']
for i in contains_outliers:
    max_limit=df[i].mean()+3*df[i].std()
    min_limit=df[i].mean()-3*df[i].std()
    df=df[(df[i]<=max_limit) & (df[i]>=min_limit)]

In [65]:
df=df.reset_index()

# ENCODING

Can apply One hot encoding directly onto all the columns combined but while applying Label encoder, we will apply it to each feature separetely

In [66]:
from sklearn.preprocessing import LabelEncoder
import pickle

In [67]:
cat_feat=['contract','city']

In [68]:
def cols_to_drop():
    return [cols for cols in df.columns if cols not in cat_feat+num_features_to_use+target+['customer_status']]
df=df.drop(columns=cols_to_drop())

In [69]:
label_encoders={}
for i in cat_feat:
    le=LabelEncoder()
    df[i]=le.fit_transform(df[i])
    label_encoders[i]=le

In [70]:
pickle.dump(label_encoders,open('../Models/label_encoders.sav','wb'))

# SCALING

In [71]:
from sklearn.preprocessing import StandardScaler 
scaler=StandardScaler()
df[num_features_to_use]=scaler.fit_transform(df[num_features_to_use])

pickle.dump(scaler,open('../Models/scaler.sav','wb'))

# SPEARMAN RANK CORRELATION

Checking once again for numerical features after removal of outliers and scaling

In [72]:
df_num_features=pd.concat([df[num_features_to_use],data['churn_value']],axis=1)

In [73]:
df_num_features

Unnamed: 0,number_of_dependents,total_charges,total_long_distance_charges,total_revenue,tenure,number_of_referrals,churn_value
0,-0.485996,-0.739211,-0.423611,-0.713674,-0.944096,0.020725,0
1,-0.485996,-0.761916,-0.774423,-0.843327,-0.944096,-0.647645,0
2,-0.485996,-0.878588,-0.727218,-0.912623,-1.149012,-0.647645,1
3,-0.485996,-0.451691,-0.448019,-0.491485,-0.780162,-0.313460,1
4,-0.485996,-0.884587,-0.865501,-0.957406,-1.189996,0.354910,1
...,...,...,...,...,...,...,...
7038,,,,,,,0
7039,,,,,,,1
7040,,,,,,,0
7041,,,,,,,0


In [74]:
correlation=df_num_features.corr()

correlation['churn_value']

number_of_dependents          -0.001232
total_charges                 -0.010621
total_long_distance_charges   -0.004711
total_revenue                 -0.009594
tenure                        -0.018054
number_of_referrals           -0.024031
churn_value                    1.000000
Name: churn_value, dtype: float64

In [75]:
df.to_csv('../Dataset/preprocessed_dataset.csv',index=False)