# Preprocessing Data

> # Read

In [4]:
import numpy as np
import scipy as scp
import sklearn as skl
import pandas as pd
import matplotlib
import pickle as pk
from sklearn.svm import SVC

In [5]:
telecom_cust = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn-2.csv");

In [6]:
telecom_cust.TotalCharges = pd.to_numeric(telecom_cust.TotalCharges, errors='coerce')
telecom_cust.MonthlyCharges = pd.to_numeric(telecom_cust.MonthlyCharges, errors='coerce')

> # Removing the missing data

In [7]:
#Removing missing values 
print(len(telecom_cust))
telecom_cust.dropna(inplace = True)
print(len(telecom_cust))

7043
7030


> # Using mean-method in handling the missing data

In [30]:
#Set the missing values by mean-method.
def set_missing_values_by_MEAN_METHOD(telecom_cust):
    list_name_of_attributes = telecom_cust.columns;
    for i in list_name_of_attributes:
        #print(telecom_cust[i].value_counts().index);
        if(telecom_cust[i].isnull().sum()>0 and telecom_cust[i].dtypes != 'object'):
            telecom_cust[i].fillna(telecom_cust[i].mean(), inplace=True);
        else:
            if (telecom_cust[i].isnull().sum()>0 and telecom_cust[i].dtypes == 'object'):
                #print(telecom_cust[i].value_counts().index);
                telecom_cust[i].fillna(telecom_cust[i].value_counts().index.sort_values()[0], inplace=True);
set_missing_values_by_MEAN_METHOD(telecom_cust);

> # Using mode-method in handling the missing data

In [31]:
#Set the missing values by mode-method.
def set_missing_values_by_MODE_METHOD(telecom_cust):
    list_name_of_attributes = telecom_cust.columns;
    for i in list_name_of_attributes:
        if(telecom_cust[i].isnull().sum()>0 and telecom_cust[i].dtypes != 'object'):
            telecom_cust[i].fillna(telecom_cust[i].mode()[0], inplace=True);
set_missing_values_by_MODE_METHOD(telecom_cust);

> # Using median-method in handling the missing data

In [33]:
#Set the missing values by median-method.
def set_missing_values_by_MEDIAN_METHOD(telecom_cust):
    list_name_of_attributes = telecom_cust.columns;
    for i in list_name_of_attributes:
        if(telecom_cust[i].isnull().sum()>0):
            telecom_cust[i].fillna(telecom_cust[i].median(), inplace=True);
            
set_missing_values_by_MEDIAN_METHOD(telecom_cust);

> # Normalizing data by max-min method

In [34]:
#max-min method
def normalization_data_max_min(telecom_cust):
    for i in telecom_cust.columns:
        if telecom_cust[i].dtypes != 'object':
            max_value = telecom_cust[i].max();
            min_value = telecom_cust[i].min();
            telecom_cust[i] = (telecom_cust[i] - min_value)/(max_value - min_value);
normalization_data_max_min(telecom_cust)

> # Normalizing data by z-scores

In [35]:
#z-score method
def normalization_data_z_score(telecom_cust):
    for i in telecom_cust.columns:
        if telecom_cust[i].dtypes != 'object':
            meanX = telecom_cust[i].mean();
            sd = telecom_cust[i].std();
            telecom_cust[i] = (telecom_cust[i]-meanX)/sd;
normalization_data_z_score(telecom_cust)

In [15]:
#Remove customer IDs from the data set
df2 = telecom_cust.iloc[:,1:]

In [16]:
#Convertin the predictor variable in a binary numeric variable
df2['Churn'].replace(to_replace='Yes', value=1, inplace=True)
df2['Churn'].replace(to_replace='No',  value=0, inplace=True)

In [17]:
#Let's convert all the categorical variables into dummy variables
df_dummies = pd.get_dummies(df2)

In [18]:
data_set = df_dummies.values
#print(data_set[488])
#for i in data_set:
  #  print(i)

Dividing data set into training set, validation set, test set. Total = 7032
- training set = 5000
- val set = 1000
- test set = 1032

In [19]:
def divide_data_set(data_set, k_train, k_val):
    R = data_set.shape[0];
    C = data_set.shape[1];
    rand_indx = np.random.choice(R, R, replace = True);
    rand_indx_train = rand_indx[np.arange(0,k_train)]
    rand_indx_val = rand_indx[np.arange(k_train,k_val + k_train)];
    rand_indx_test = rand_indx[np.arange(k_train + k_val,R)];
    #------------------------------------------------------------
    X_train = data_set[rand_indx_train];
    Y_train = X_train[:,-1:];
    X_train = X_train[:,0:C-1];
    print("X_train shape = (%d,%d)"%(X_train.shape));
    print("Y_train shape = (%d,%d)"%(Y_train.shape));
    #------------------------------------------------------------
    X_val = data_set[rand_indx_val];
    Y_val = X_val[:,-1:];
    X_val = X_val[:,0:C-1];
    #------------------------------------------------------------
    X_test = data_set[rand_indx_train];
    Y_test = X_test[:,-1:];
    X_test = X_test[:,0:C-1];
    return X_train, Y_train, X_val, Y_val, X_test, Y_test;

In [20]:
#data set was divided: 5000:1000:1032 = train: Validation: test.
X_train, Y_train, X_val, Y_val, X_test, Y_test = divide_data_set(data_set, 5000,1000);

X_train shape = (5000,45)
Y_train shape = (5000,1)


In [21]:
print(X_train)
print(Y_train)

[[-0.44037054  0.43064531  0.34550586 ...  0.          0.
   1.        ]
 [-0.44037054  0.30841189 -0.31756941 ...  0.          0.
   0.        ]
 [ 2.27049191 -0.34349966  0.46682038 ...  0.          0.
   1.        ]
 ...
 [ 2.27049191  0.06394506  0.67787441 ...  0.          1.
   0.        ]
 [-0.44037054  0.145434   -1.48916856 ...  0.          0.
   0.        ]
 [ 2.27049191 -0.79168886  0.17765974 ...  0.          0.
   1.        ]]
[[0.]
 [1.]
 [0.]
 ...
 [0.]
 [1.]
 [0.]]


In [22]:
model = SVC(C=30, kernel='linear');
out_model = model.fit(X_train, Y_train);

  y = column_or_1d(y, warn=True)


In [23]:
print(out_model)

SVC(C=30, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)


In [24]:
model_name = "svc_c_30_linear"
pk.dump(out_model, open(model_name, "wb"))

In [25]:
print(out_model.score(X_val,Y_val));

1.0


In [26]:
model2 = SVC(C=300, kernel='linear');
out_model2 = model2.fit(X_train, Y_train);

  y = column_or_1d(y, warn=True)


In [27]:
model_name2 = "svc_c_300_linear"
pk.dump(out_model2, open(model_name2, "wb"))

In [28]:
print(out_model2.score(X_val,Y_val));

1.0
