# Preprocessing Data

> # Read

In [53]:
import numpy as np
import scipy as scp
import sklearn as skl
import pandas as pd
import matplotlib
import pickle as pk
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [54]:
telecom_cust = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv");

In [55]:
telecom_cust.TotalCharges = pd.to_numeric(telecom_cust.TotalCharges, errors='coerce')
telecom_cust.MonthlyCharges = pd.to_numeric(telecom_cust.MonthlyCharges, errors='coerce')

> # Removing the missing data

In [56]:
#Removing missing values 
print(len(telecom_cust))
telecom_cust.dropna(inplace = True)
print(len(telecom_cust))

7043
7032


> # Using mean-method in handling the missing data

In [57]:
#Set the missing values by mean-method.
def set_missing_values_by_MEAN_METHOD(telecom_cust):
    list_name_of_attributes = telecom_cust.columns;
    for i in list_name_of_attributes:
        #print(telecom_cust[i].value_counts().index);
        if(telecom_cust[i].isnull().sum()>0 and telecom_cust[i].dtypes != 'object'):
            telecom_cust[i].fillna(telecom_cust[i].mean(), inplace=True);
        else:
            if (telecom_cust[i].isnull().sum()>0 and telecom_cust[i].dtypes == 'object'):
                #print(telecom_cust[i].value_counts().index);
                telecom_cust[i].fillna(telecom_cust[i].value_counts().index.sort_values()[0], inplace=True);
set_missing_values_by_MEAN_METHOD(telecom_cust);

> # Using mode-method in handling the missing data

In [58]:
#Set the missing values by mode-method.
def set_missing_values_by_MODE_METHOD(telecom_cust):
    list_name_of_attributes = telecom_cust.columns;
    for i in list_name_of_attributes:
        if(telecom_cust[i].isnull().sum()>0 and telecom_cust[i].dtypes != 'object'):
            telecom_cust[i].fillna(telecom_cust[i].mode()[0], inplace=True);
set_missing_values_by_MODE_METHOD(telecom_cust);

> # Using median-method in handling the missing data

In [59]:
#Set the missing values by median-method.
def set_missing_values_by_MEDIAN_METHOD(telecom_cust):
    list_name_of_attributes = telecom_cust.columns;
    for i in list_name_of_attributes:
        if(telecom_cust[i].isnull().sum()>0):
            telecom_cust[i].fillna(telecom_cust[i].median(), inplace=True);
            
set_missing_values_by_MEDIAN_METHOD(telecom_cust);

> # Normalizing data by max-min method

In [60]:
#max-min method
def normalization_data_max_min(telecom_cust):
    for i in telecom_cust.columns:
        if telecom_cust[i].dtypes != 'object':
            max_value = telecom_cust[i].max();
            min_value = telecom_cust[i].min();
            telecom_cust[i] = (telecom_cust[i] - min_value)/(max_value - min_value);
normalization_data_max_min(telecom_cust)

> # Normalizing data by z-scores

In [61]:
#z-score method
def normalization_data_z_score(telecom_cust):
    for i in telecom_cust.columns:
        if telecom_cust[i].dtypes != 'object':
            meanX = telecom_cust[i].mean();
            sd = telecom_cust[i].std();
            telecom_cust[i] = (telecom_cust[i]-meanX)/sd;
normalization_data_z_score(telecom_cust)

> # Remove a column

In [62]:
#Remove customer IDs from the data set
df2 = telecom_cust.iloc[:,1:]

> # Converting the string to int in the classed var

In [63]:
#Convertin the predictor variable in a binary numeric variable
df2['Churn'].replace(to_replace='Yes', value=1, inplace=True)
df2['Churn'].replace(to_replace='No',  value=0, inplace=True)

> # Convert all off categorical variables into dummy var

In [77]:
#Let's convert all the categorical variables into dummy variables
df_dummies = pd.get_dummies(df2)
label = df_dummies['Churn'].copy();
df_dummies.drop("Churn", axis=1, inplace = True);
#df_dummies['Churn']=label;
df_dummies.to_csv("CustomerChurn.csv");
print(df_dummies.head())

   SeniorCitizen    tenure  MonthlyCharges  TotalCharges  gender_Female  \
0      -0.440296 -1.280157       -1.161611     -0.994123              1   
1      -0.440296  0.064298       -0.260859     -0.173727              0   
2      -0.440296 -1.239416       -0.363897     -0.959581              0   
3      -0.440296  0.512450       -0.747797     -0.195234              0   
4      -0.440296 -1.239416        0.196164     -0.940391              1   

   gender_Male  Partner_No  Partner_Yes  Dependents_No  Dependents_Yes  ...  \
0            0           0            1              1               0  ...   
1            1           1            0              1               0  ...   
2            1           1            0              1               0  ...   
3            1           1            0              1               0  ...   
4            0           1            0              1               0  ...   

   StreamingMovies_Yes  Contract_Month-to-month  Contract_One year  \
0   

In [67]:
X = df_dummies.values;
Y = label.values;
print(X);
print(Y);

[[-0.44029578 -1.280157   -1.16161133 ...  0.          1.
   0.        ]
 [-0.44029578  0.06429811 -0.26085937 ...  0.          0.
   1.        ]
 [-0.44029578 -1.23941594 -0.36389742 ...  0.          0.
   1.        ]
 ...
 [-0.44029578 -0.87274636 -1.16992085 ...  0.          1.
   0.        ]
 [ 2.27087753 -1.15793381  0.31914512 ...  0.          0.
   1.        ]
 [-0.44029578  1.36801217  1.35783512 ...  0.          0.
   0.        ]]
[0 0 1 ... 0 1 0]


> # Divide data set

Dividing data set into training set, validation set, test set. Total = 7032
- training set = 7032/2
- val set = 7032/4
- test set = 7032/4

In [68]:
#data set was divided: 5000:1000:1032 = train: Validation: test.
X_train_val, X_test, Y_train_val, Y_test = train_test_split(X, Y, test_size=0.2);
X_train, X_val, Y_train, Y_val = train_test_split(X_train_val, Y_train_val, test_size=0.25)

In [69]:
print("X train shape = (%d , %d)"%X_train.shape + ", Y train shape = (%d)"%Y_train.shape);
print("X valid shape = (%d , %d)"%X_val.shape + ", Y valid shape = (%d)"%Y_val.shape);
print("X test  shape = (%d , %d)"%X_test.shape + ", Y test  shape = (%d)"%Y_test.shape);

X train shape = (4218 , 45), Y train shape = (4218)
X valid shape = (1407 , 45), Y valid shape = (1407)
X test  shape = (1407 , 45), Y test  shape = (1407)


> # Training and test

In [70]:
model = SVC(C=0.5, kernel='linear');
out_model = model.fit(X_train, Y_train.flatten());

In [71]:
print(out_model)

SVC(C=0.5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)


In [72]:
model_name = "svc_c_30_linear"
pk.dump(out_model, open(model_name, "wb"))

In [73]:
print(out_model.score(X_val,Y_val));

0.7846481876332623


In [74]:
model2 = SVC(C=3000, kernel='linear');
out_model2 = model2.fit(X_train, Y_train);

In [75]:
model_name2 = "svc_c_300_linear"
pk.dump(out_model2, open(model_name2, "wb"))

In [76]:
print(out_model2.score(X_val,Y_val));

0.783226723525231
