# Bhargavi Pasam: Santander Customer Satisfaction Assignment

In [79]:
# Do imports first
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#For saving files
from numpy import asarray
from numpy import savetxt

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import RandomUnderSampler

# Read the file and examine its columns

In [35]:
# Load the training set and preview its first five rows
train = pd.read_csv('../input/santander-customer-satisfaction/train.csv')
train.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


# See how big the data set is: Rows, Columns

In [36]:
# Size of the training set as (rows, columns)
print(train.shape)

(76020, 371)


# Understand the uniqueness of data across the rows for each column

In [37]:
# Uniqueness (feature engineering):
# for every column, count how many distinct values occur among the rows.
unique_numbers = train.nunique()
print (unique_numbers)

print (type(unique_numbers))
print ("One of the unique numbers")
# unique_numbers is labelled by column name, so positional access must go
# through .iloc; a bare integer key on a non-integer index is deprecated in pandas.
print (unique_numbers.iloc[0])

# ID                         76020
# var3                         208
# var15                        100
# imp_ent_var16_ult1           596
# imp_op_var39_comer_ult1     7551
#                           ...  
# saldo_medio_var44_hace3       33
# saldo_medio_var44_ult1       141
# saldo_medio_var44_ult3       141
# var38                      57736
# TARGET                         2
# Length: 371, dtype: int64

ID                         76020
var3                         208
var15                        100
imp_ent_var16_ult1           596
imp_op_var39_comer_ult1     7551
                           ...  
saldo_medio_var44_hace3       33
saldo_medio_var44_ult1       141
saldo_medio_var44_ult3       141
var38                      57736
TARGET                         2
Length: 371, dtype: int64
<class 'pandas.core.series.Series'>
One of the unique numbers
76020


# Drop Columns whose rows contain the same value

In [38]:
# Drop every column (feature) that holds a single value in all rows:
# a feature that never changes cannot help separate the classes.
#
# The distinct-value counts are computed once for the whole frame and the
# constant columns are removed in a single drop(), which returns a new frame
# and leaves `train` untouched.
distinct_per_column = train.nunique()
constant_columns = distinct_per_column[distinct_per_column == 1].index
newdf = train.drop(columns=constant_columns)

print(newdf.shape)

# Went from 371 to 337 columns

(76020, 337)


# Split the training data set

In [40]:
# Input data set: every remaining column except the label (TARGET)
# and the ID column, which doesn't play a role
X = newdf.drop(columns=['TARGET', 'ID'])

# Output vector
y = newdf['TARGET']

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

# Print the dimensions of each part
for split_label, split_part in (
    ("X_train:", X_train),
    ("X_test:", X_test),
    ("Y_train:", y_train),
    ("Y_test:", y_test),
):
    print (split_label, split_part.shape)

# Results
# X_train: (60816, 335)
# X_test: (15204, 335)
# Y_train: (60816,)
# Y_test: (15204,)

X_train: (60816, 335)
X_test: (15204, 335)
Y_train: (60816,)
Y_test: (15204,)


# Sampling related: Over and Under

In [48]:
def _report_class_balance(header, shape_label, features, labels):
    """Print the count of each class in `labels` and the shape of `features`.

    `header` and `shape_label` are printed verbatim, so each call site keeps
    its own wording.
    """
    class_counts = labels.value_counts()
    print(header)
    print('Number of zeros: ', class_counts[0],'\nNumber of ones: ', class_counts[1])
    print(shape_label, features.shape,'\n\n\n')


# Rank the columns by number of distinct values (largest first) and take the
# entries at ranks 1 and 2; the entry at rank 0 is deliberately skipped.
ranked_features = unique_numbers.sort_values(ascending=False).index
most_varying_feature1 = ranked_features[1]
most_varying_feature2 = ranked_features[2]

print ("Most varying feature 1: ", most_varying_feature1)
print ("Most varying feature 2: ",most_varying_feature2)

# Check the class imbalance of the raw training split
print ("")
_report_class_balance(
    'Data imbalance before sampling: ',
    'Shape of x_train before sampling',
    X_train,
    y_train,
)

# Step 1: oversample the minority class with SMOTE
sm = SMOTE(sampling_strategy=0.2, random_state=42)
X_over, y_over = sm.fit_resample(X_train, y_train)

_report_class_balance(
    'Data imbalance after oversampling (SMOTE): ',
    'Shape of x_train after sampling',
    X_over,
    y_over,
)

# Step 2: randomly undersample the majority class.
# Disabled alternative kept from the original cell:
# undersample = NearMiss(version=3, n_neighbors_ver3=3)
undersample = RandomUnderSampler(random_state=42)
X_under, y_under = undersample.fit_resample(X_over,y_over)

_report_class_balance(
    'Data imbalance after final undersampling (random undersampler): ',
    'Shape of x_train after sampling',
    X_under,
    y_under,
)

Most varying feature 1:  var38
Most varying feature 2:  saldo_medio_var5_ult3

Data imbalance before sampling: 
Number of zeros:  58415 
Number of ones:  2401
Shape of x_train before sampling (60816, 335) 



Data imbalance after oversampling (SMOTE): 
Number of zeros:  58415 
Number of ones:  11683
Shape of x_train after sampling (70098, 335) 



Data imbalance after final undersampling (random undersampler): 
Number of zeros:  11683 
Number of ones:  11683
Shape of x_train after sampling (23366, 335) 





# Decision Tree: Fit the model

In [54]:
# Train on the resampled (class-balanced) data produced by the sampling step.
final_X_train = X_under
final_Y_train = y_under

print ("Final Arrays input and output")
print ("X: ", final_X_train.shape)
print ("Y: ", final_Y_train.shape)
print ("")

print ("Fit the model\n")
# random_state is pinned like every other stochastic step in this notebook:
# scikit-learn permutes the candidate features at each split even with
# splitter='best', so an unseeded tree can differ from run to run.
clf_model = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=7,
    min_samples_split=5,
    random_state=42,
)
clf_model.fit(final_X_train,final_Y_train)

# Sanity check only: this scores the model on the data it was fitted on.
pred_train = clf_model.predict(final_X_train)

print("Accuracy when tested against the training data itself")
print(accuracy_score(final_Y_train, pred_train))

Final Arrays input and output
X:  (23366, 335)
Y:  (23366,)

Fit the model

Accuracy when tested against the training data itself
0.8458443892835744


# Let's test against the "Split Test Data"

In [65]:
# Evaluate on the held-out split (X_test / y_test). It was split off before
# any resampling, so it still has the original class proportions.
print (X_test.shape)
print (y_test.shape)

pred_test_results = clf_model.predict(X_test)

print("Accuracy when tested against the split testing data")
print(accuracy_score(y_test, pred_test_results))

# Accuracy alone is misleading when one class dominates, so report the score
# of the trivial "always predict the majority class" rule for comparison.
majority_baseline_accuracy = y_test.value_counts(normalize=True).max()
print("Accuracy of always predicting the majority class")
print(majority_baseline_accuracy)

# ROC AUC is computed from the predicted probability of class 1 (second
# column of predict_proba) and does not depend on the class proportions.
pred_test_scores = clf_model.predict_proba(X_test)[:, 1]
print("ROC AUC when tested against the split testing data")
print(roc_auc_score(y_test, pred_test_scores))

(15204, 335)
(15204,)
Accuracy when tested against the split testing data
0.835438042620363


# Read the real test data to predict the Target

In [59]:
# Load the competition test set and preview its first five rows
test_data = pd.read_csv('../input/santander-customer-satisfaction/test.csv')
test_data.head()


Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,2,2,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40532.1
1,5,2,35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45486.72
2,6,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46993.95
3,7,2,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,187898.61
4,9,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,73649.73


# Drop some columns from test data as was done for training data

In [70]:
# Remove from the test data the same columns that were removed from the
# training data: those holding a single value across all *training* rows.
# The decision is made on `train`, not on `test_data`, so both frames end up
# with the same feature set.
train_distinct_counts = train.nunique()
train_constant_columns = train_distinct_counts[train_distinct_counts == 1].index

# Rebind instead of dropping in place, and ignore columns that are already
# gone, so that re-running this cell is a harmless no-op rather than a KeyError.
test_data = test_data.drop(columns=train_constant_columns, errors='ignore')

print(test_data.shape)

# Test data has no TARGET column: went from 370 to 336 columns

(75818, 336)


# Remove the ID Field

In [71]:
# Remove the ID column from the test features
test_data_with_out_id = test_data.drop(columns="ID")
print ("\nNew columns")
test_data_with_out_id.head()


New columns


Unnamed: 0,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var40_ult1,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,2,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40532.1
1,2,35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45486.72
2,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46993.95
3,2,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,187898.61
4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,73649.73


# Make sure the final X train also looks similar in its column structure

In [61]:
# head() elides most of the columns in its display, so eyeballing it cannot
# confirm the layout. Check explicitly that the training features and the test
# features have the same columns in the same order before predicting.
assert list(final_X_train.columns) == list(test_data_with_out_id.columns), (
    "training and test feature columns differ"
)
final_X_train.head(5)

Unnamed: 0,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var40_ult1,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,2,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113644.38
1,2,26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016
2,2,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,68427.15
3,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48179.01
4,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,256264.86


# Run the model against the real test data, which has no TARGET column (the actual test)

In [77]:
# Predict on the test data read from test.csv, with the ID column removed
print(test_data_with_out_id.shape)

pred_y = clf_model.predict(test_data_with_out_id)

print("Predicted Y")
# Distinct predicted labels and how many rows received each one
Output, counts = np.unique(pred_y, return_counts=True)
for tally in (Output, counts):
    print(tally)
dict(zip(Output, counts))

(75818, 335)
Predicted Y
[0 1]
[62610 13208]


{0: 62610, 1: 13208}

# Save the results to a file

In [86]:
# Build the submission file: one row per test ID with its predicted TARGET.
ID_vector = test_data["ID"]
print (ID_vector)

# Construct the frame directly from the two columns. Unlike pairing them with
# zip(), this raises if the lengths differ instead of silently truncating.
pred_dataframe = pd.DataFrame({'ID': ID_vector.to_numpy(), 'TARGET': pred_y})

pred_dataframe.to_csv("santander-customer-satisfaction-try-2.csv", index=False)

# Last expression of the cell, so the preview is actually displayed
pred_dataframe.head(5)

0             2
1             5
2             6
3             7
4             9
          ...  
75813    151831
75814    151832
75815    151833
75816    151834
75817    151837
Name: ID, Length: 75818, dtype: int64
