# Preprocessing Data

> # Read

In [1]:
import numpy as np
import scipy as scp
import sklearn as skl
import pandas as pd
import matplotlib.pyplot as plt
import pickle as pk
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from scipy.stats import zscore
from sklearn.cluster import DBSCAN
from sklearn.ensemble import IsolationForest
import time
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from IPython.display import Image
from IPython.core.display import HTML 
import pydot
import pydotplus
from sklearn.externals.six import StringIO

In [2]:
telecom_cust = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv");

In [3]:
telecom_cust.TotalCharges = pd.to_numeric(telecom_cust.TotalCharges, errors='coerce')
telecom_cust.MonthlyCharges = pd.to_numeric(telecom_cust.MonthlyCharges, errors='coerce')

> # Removing the missing data

In [4]:
def removing_missing_values(telecom_cust):
    telecom_cust.dropna(inplace = True);
    telecom_cust.reset_index(drop=True, inplace = True);
    return;

In [5]:
print(telecom_cust.shape);
removing_missing_values(telecom_cust);
print(telecom_cust.shape);

(7043, 21)
(7032, 21)


In [12]:
telecom_cust.to_csv("WA_Fn-UseC_-Telco-Customer-Churn-standard.csv")

> # Removing the duplicates

In [16]:
def removing_duplicates(telecom_cust):
    telecom_cust = pd.DataFrame.drop_duplicates(telecom_cust);
    telecom_cust.reset_index(drop=True, inplace = True);
    return;

In [17]:
removing_duplicates(telecom_cust);
print(telecom_cust.shape)

(7032, 21)


> # Using mean-method in handling the missing data

In [29]:
#Set the missing values by mean-method.
def set_missing_values_by_MEAN_METHOD(telecom_cust):
    list_name_of_attributes = telecom_cust.columns;
    for i in list_name_of_attributes:
        #print(telecom_cust[i].value_counts().index);
        if(telecom_cust[i].isnull().sum()>0 and telecom_cust[i].dtypes != 'object'):
            telecom_cust[i].fillna(telecom_cust[i].mean(), inplace=True);
        else:
            if (telecom_cust[i].isnull().sum()>0 and telecom_cust[i].dtypes == 'object'):
                #print(telecom_cust[i].value_counts().index);
                telecom_cust[i].fillna(telecom_cust[i].value_counts().index.sort_values()[0], inplace=True);
    return;

In [30]:
set_missing_values_by_MEAN_METHOD(telecom_cust);

> # Using mode-method in handling the missing data

In [None]:
#Set the missing values by mode-method.
def set_missing_values_by_MODE_METHOD(telecom_cust):
    list_name_of_attributes = telecom_cust.columns;
    for i in list_name_of_attributes:
        if(telecom_cust[i].isnull().sum()>0 and telecom_cust[i].dtypes != 'object'):
            telecom_cust[i].fillna(telecom_cust[i].mode()[0], inplace=True);
    return;

In [None]:
set_missing_values_by_MODE_METHOD(telecom_cust);

> # Using median-method in handling the missing data

In [None]:
#Set the missing values by median-method.
def set_missing_values_by_MEDIAN_METHOD(telecom_cust):
    list_name_of_attributes = telecom_cust.columns;
    for i in list_name_of_attributes:
        if(telecom_cust[i].isnull().sum()>0):
            telecom_cust[i].fillna(telecom_cust[i].median(), inplace=True);
    return;

In [None]:
set_missing_values_by_MEDIAN_METHOD(telecom_cust);

> # Normalizing data by max-min method

In [8]:
#max-min method
def normalization_data_max_min(telecom_cust):
    for i in telecom_cust.columns:
        if telecom_cust[i].dtypes != 'object':
            max_value = telecom_cust[i].max();
            min_value = telecom_cust[i].min();
            telecom_cust[i] = (telecom_cust[i] - min_value)/(max_value - min_value);
    return;

In [9]:
normalization_data_max_min(telecom_cust)

> # Normalizing data by z-scores

In [None]:
#z-score method
def normalization_data_z_score(telecom_cust):
    for i in telecom_cust.columns:
        if telecom_cust[i].dtypes != 'object':
            meanX = telecom_cust[i].mean();
            sd = telecom_cust[i].std();
            telecom_cust[i] = (telecom_cust[i]-meanX)/sd;
    return;

In [None]:
normalization_data_z_score(telecom_cust)

> # Outlier detection by zscores for all rows with one method:

In [None]:
#print(telecom_cust["TotalCharges"]);
def outlier_detection_by_zscores(telecom_cust, start, end):
    print("Condition: Data frame must be normalized by z-scores before");
    for i in telecom_cust.columns:
        if telecom_cust[i].dtypes != 'object':
            telecom_cust["outlier"] = telecom_cust[i].apply(lambda x: x <=start or x >= end);
            outlier_indx = telecom_cust[telecom_cust.outlier==True].index.values;
            if (len(outlier_indx)>0):
                telecom_cust.iloc[outlier_indx,:]=np.nan;
            telecom_cust.drop("outlier", axis=1, inplace=True);
    removing_missing_values(telecom_cust);
    return;

In [None]:
outlier_detection_by_zscores(telecom_cust,-2.5, 2.5)
print(telecom_cust.shape)

> # Remove a column

In [6]:
#Remove customer IDs from the data set
df2 = telecom_cust.iloc[:,1:]

> # Converting the string to int in the classed var

In [7]:
#Convertin the predictor variable in a binary numeric variable
df2['Churn'].replace(to_replace='Yes', value=1, inplace=True)
df2['Churn'].replace(to_replace='No',  value=0, inplace=True)

> # Convert all off categorical variables into dummy var

In [8]:
#Let's convert all the categorical variables into dummy variables
df_dummies = pd.get_dummies(df2);
print(df_dummies.shape)
df_dummies_dbscan = pd.get_dummies(df2);
#label = df_dummies['Churn'].copy();
#df_dummies.drop("Churn", axis=1, inplace = True);
#df_dummies['Churn']=label;
#df_dummies.to_csv("CustomerChurn.csv");

(7032, 46)


> # Data Discretisation by Decision Tree

In [181]:
def data_discretisation_by_decision_tree(df, col_1_name, col_2_name, Test_size = 0.3, Max_depth = 3):
    tree_model = DecisionTreeClassifier(max_depth = Max_depth);
    #print(tree_model);
    X_train, X_test, Y_train, Y_test = train_test_split(df[col_1_name], df[col_2_name], test_size = Test_size);
    #print(X_train['TotalCharges'].dtypes);
    #print(Y_train['Churn'].dtypes);
    tree_model.fit(X_train.to_frame(), Y_train.to_frame());
    #print("X train shape = (%d , %d)"%X_train.shape + ", Y train shape = (%d , %d)"%Y_train.shape);
    label = tree_model.predict_proba(X_train.to_frame());
    X_train = X_train.to_frame();
    #print(X_train.head(3));
    new_col_name = col_1_name + "_" + col_2_name + "_Proba"
    X_train[col_2_name] = Y_train;
    X_train[new_col_name] = label[:, 1];
    
    #fig = plt.figure();
    #fig = X_train.groupby([new_col_name])[col_2_name].mean().plot();
    
    #X_train.groupby([new_col_name])[col_2_name].count().plot.bar();
    
    #interval = pd.concat( [X_train.groupby([new_col_name])[col_1_name].min(), X_train.groupby([new_col_name])[col_1_name].max()], axis=1)

    with open("tree_model.txt", "w") as f:
        f = sklearn.tree.export_graphviz(tree_model, out_file=f);
    dotfile = StringIO()
    sklearn.tree.export_graphviz(tree_model, out_file=dotfile)
    (graph, ) = pydot.graph_from_dot_data(dotfile.getvalue());
    graph.write_png("tree_model.png");
    

In [182]:
data_discretisation_by_decision_tree(df_dummies, 'TotalCharges', 'Churn', Max_depth = 2, Test_size = 0.2);

FileNotFoundError: [WinError 2] "dot" not found in path.

> # Create X, Y matrix for training

In [13]:
X = df_dummies.values;
Y = label.values;

> # Outlier detection by DBSCAN for all rows with one method:

In [14]:
def outlier_detection_by_DBSCAN(df, Eps, Metric, Min_samples, N_jobs):
    print("Condition: Data must be normalized by max-min method")
    outlier_detection = DBSCAN(eps=Eps, metric=Metric, min_samples = Min_samples, n_jobs = N_jobs);
    df['outlier'] = outlier_detection.fit_predict(df);
    outlier_indx = df[df.outlier == 0].index.values;
    if (len(outlier_indx)>0):
        df.iloc[outlier_indx,:]=np.nan;
    df.drop("outlier", axis=1, inplace=True);
    removing_missing_values(telecom_cust);
    return;

In [15]:
print(df_dummies_dbscan.shape)
outlier_detection_by_DBSCAN(df = df_dummies_dbscan, Eps = 2, Metric='euclidean', Min_samples = 5, N_jobs=-1);
print(df_dummies_dbscan.shape)

(7032, 46)
Condition: Data must be normalized by max-min method
(7032, 46)


> # Outlier detection by Isolation Random Forest

In [None]:
def outlier_detection_by_IsolationRandomForest(X, Y, Max_samples=10, 
                                               Random_state = np.random.RandomState(42), 
                                               Contamination = 'auto', 
                                               Behaviour = 'new'):
    clf = IsolationForest(max_samples=Max_samples, 
                          random_state=Random_state, 
                          contamination = Contamination, 
                          behaviour = Behaviour);
    clf.fit(X, Y);
    outliers = np.where(clf.predict(X) != -1);
    X = X[outliers]
    Y = Y[outliers]
    return 0;

In [None]:
outlier_detection_by_IsolationRandomForest(X,Y);
print(X.shape);
print(Y.shape);

> # Divide data set

Dividing data set into training set, validation set, test set. Total = 7032
- training set = 60%
- val set = 20%
- test set = 20%

In [16]:
X_train_val, X_test, Y_train_val, Y_test = train_test_split(X, Y, test_size=0.2);
X_train, X_val, Y_train, Y_val = train_test_split(X_train_val, Y_train_val, test_size=0.25)

In [17]:
print("X train shape = (%d , %d)"%X_train.shape + ", Y train shape = (%d)"%Y_train.shape);
print("X valid shape = (%d , %d)"%X_val.shape + ", Y valid shape = (%d)"%Y_val.shape);
print("X test  shape = (%d , %d)"%X_test.shape + ", Y test  shape = (%d)"%Y_test.shape);

X train shape = (4218 , 45), Y train shape = (4218)
X valid shape = (1407 , 45), Y valid shape = (1407)
X test  shape = (1407 , 45), Y test  shape = (1407)


In [None]:
pk.dump(X_train, open("X_train", "wb"));
pk.dump(Y_train, open("Y_train", "wb"));
pk.dump(X_val, open("X_val", "wb"));
pk.dump(Y_val, open("Y_val", "wb"));
pk.dump(X_test, open("X_test", "wb"));
pk.dump(Y_test, open("Y_test", "wb"));