In [317]:
import pandas as pd 
import numpy as np
import sklearn 

In [318]:
data=pd.read_csv('../Dataset/combined_data.csv')
df=data.copy()

In [319]:
num_features_to_use=['number_of_dependents', 'total_charges', 'total_long_distance_charges', 'total_revenue', 'tenure', 'number_of_referrals']
dependent_cat_features=['under_30', 'senior_citizen', 'married', 'city', 'zip_code', 'internet_service', 'online_security', 'online_backup', 'device_protection', 'premium_tech_support', 'streaming_tv', 'streaming_movies', 'streaming_music', 'internet_type', 'contract', 'paperless_billing', 'payment_method', 'internet_service.1', 'multiple_lines', 'unlimited_data', 'offer', 'referred_a_friend', 'customer_status', 'churn_reason']
target=['churn_value']
print("No of numerical features:",len(num_features_to_use))
print("No of categorical features:",len(dependent_cat_features))
print("Target feature:",target[0])

No of numerical features: 6
No of categorical features: 24
Target feature: churn_value


Out of those categorical features, we will use contract and city as these two features had the highest feature importance followed by churn_reason.

In [320]:
cat_feat=['contract','city']
def cols_to_drop():
    return [cols for cols in df.columns if cols not in cat_feat+num_features_to_use+target+['customer_status']]
df=df.drop(columns=cols_to_drop())

# OUTLIER REMOVAL

total_revenue,total_long_distance_charges has outliers

In [321]:
def remove_outliers(df):
    contains_outliers=['total_revenue','total_long_distance_charges']
    for i in contains_outliers:
        max_limit=df[i].mean()+3*df[i].std()
        min_limit=df[i].mean()-3*df[i].std()
        df=df[(df[i]<=max_limit) & (df[i]>=min_limit)]
        return df

df=remove_outliers(df)


In [322]:
df.reset_index(drop=True,inplace=True)

# BALANCING THE DATASET 

In [323]:
x=df.iloc[:,0:8]
y=df['customer_status']

In [324]:
print("No of entries with class as Churned before over sampling:",df[df['customer_status']=='Churned'].shape)
print("No of entries with class as Not Churned before over sampling:",df[(df['customer_status']=='Joined') | (df['customer_status']=='Stayed')].shape)

No of entries with class as Churned before over sampling: (1869, 10)
No of entries with class as Not Churned before over sampling: (5169, 10)


In [326]:
from imblearn.over_sampling import SMOTENC
sampler=SMOTENC(sampling_strategy={'Churned':5169,"Stayed":5169,"Joined":5169},categorical_features=['city','contract'])
x_resampled,y_resampled=sampler.fit_resample(x,y)

In [327]:
df=pd.concat([x_resampled,y_resampled],axis=1)

In [329]:
print("No of entries with class as Churned after over sampling:",df[df['customer_status']=='Churned'].shape)
print("No of entries with class as Not Churned after over sampling:",df[(df['customer_status']=='Joined') | (df['customer_status']=='Stayed')].shape)

No of entries with class as Churned after over sampling: (5169, 9)
No of entries with class as Not Churned after over sampling: (10338, 9)


As 5169 samples for Churned,5169 samples for Joined and 5169 samples for Stayed.

In [330]:
from sklearn.model_selection import train_test_split
train,test=train_test_split(df,test_size=0.2)

# ENCODING

Can apply One hot encoding directly onto all the columns combined but while applying Label encoder, we will apply it to each feature separetely

In [331]:
from sklearn.preprocessing import LabelEncoder
import pickle

In [332]:
label_encoders={}
for i in cat_feat:
    if i=='city':
        train[i]=train[i].str.lower()
    le=LabelEncoder()
    train[i]=le.fit_transform(train[i])
    label_encoders[i]=le

In [333]:
for i in cat_feat:
    if i=='city':
        test[i]=test[i].str.lower()
    test[i]=label_encoders[i].transform(test[i])

In [334]:
pickle.dump(label_encoders,open('../Models/label_encoders.sav','wb'))

# SCALING

In [335]:
train

Unnamed: 0,number_of_dependents,city,contract,total_charges,total_long_distance_charges,total_revenue,tenure,number_of_referrals,customer_status
11727,0,210,0,46.622001,97.009698,143.631699,2,1,Joined
13309,0,1058,0,71.146913,36.627003,107.773916,1,0,Joined
6640,0,12,1,943.100000,1405.000000,2348.100000,50,1,Stayed
10152,0,1015,0,69.456620,45.904070,115.360690,1,0,Churned
6653,0,52,1,1654.450000,925.400000,2579.850000,28,0,Stayed
...,...,...,...,...,...,...,...,...,...
4226,2,470,2,1160.450000,736.960000,1897.410000,47,3,Stayed
9276,0,584,0,85.581907,3.747934,93.756331,1,0,Churned
10684,0,679,0,20.213852,7.547547,27.761398,1,0,Joined
5186,2,292,0,1936.850000,626.520000,2563.370000,23,5,Stayed


In [336]:
from sklearn.preprocessing import StandardScaler 
scaler=StandardScaler()
train[num_features_to_use]=scaler.fit_transform(train[num_features_to_use])
test[num_features_to_use]=scaler.transform(test[num_features_to_use])

pickle.dump(scaler,open('../Models/scaler.sav','wb'))

# SPEARMAN RANK CORRELATION

Checking once again for numerical features after removal of outliers and scaling

In [337]:
df_num_features=pd.concat([df[num_features_to_use],data['churn_value']],axis=1)

In [338]:
df_num_features

Unnamed: 0,number_of_dependents,total_charges,total_long_distance_charges,total_revenue,tenure,number_of_referrals,churn_value
0,0,593.300000,381.510000,974.810000,9,2,0.0
1,0,542.400000,96.210000,610.280000,9,0,0.0
2,0,280.850000,134.600000,415.450000,4,0,1.0
3,0,1237.850000,361.660000,1599.510000,13,1,1.0
4,0,267.400000,22.140000,289.540000,3,3,1.0
...,...,...,...,...,...,...,...
15502,0,238.509211,206.700722,445.209933,12,0,
15503,2,5264.429989,711.509843,5975.939832,59,1,
15504,0,2322.794723,651.697943,2965.185013,32,0,
15505,0,1463.235769,438.901702,1981.331125,18,0,


In [339]:
correlation=df_num_features.corr()

correlation['churn_value']

number_of_dependents           0.020656
total_charges                  0.008246
total_long_distance_charges    0.012059
total_revenue                  0.010030
tenure                        -0.002873
number_of_referrals           -0.013085
churn_value                    1.000000
Name: churn_value, dtype: float64

In [340]:
train.to_csv('../Dataset/preprocessed_train_dataset.csv',index=False)
test.to_csv('../Dataset/preprocessed_test_dataset.csv',index=False)