In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, files in os.walk('/kaggle/input')
    for file_name in files
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing libraries.

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from scipy import stats
from sklearn.metrics import accuracy_score, roc_curve, auc, mean_squared_error, f1_score
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'imblearn'

# Reading the data.

In [None]:
# Shell escape: list the contents of the dataset's input directory.
!ls ../input/santander-customer-satisfaction/

In [None]:
# Read the training set, the test set and the sample submission, in that order.
csv_paths = (
    "../input/santander-customer-satisfaction/train.csv",
    "../input/santander-customer-satisfaction/test.csv",
    "../input/santander-customer-satisfaction/sample_submission.csv",
)
train, test, sample = map(pd.read_csv, csv_paths)

# Preprocessing.

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Preview the first rows of the sample submission file.
sample.head()

In [None]:
# Report (rows, columns) for each of the three loaded frames.
for template, frame in (
    ("Shape of train dataframe is {}", train),
    ("Shape of test dataframe is {}", test),
    ("Shape of sample dataframe is {}", sample),
):
    print(template.format(frame.shape))

In [None]:
# Report whether any cell is missing in the training / test frames (True means at least one null).
for template, frame in (
    ('Null values in training data is {}', train),
    ('Null values in testing data is {}', test),
):
    has_nulls = frame.isna().any().any()
    print(template.format(has_nulls))

In [None]:
# Summary statistics for the training data.
train.describe()

In [None]:
# Structural overview (dtypes, memory usage) of the training frame.
train.info()

In [None]:
# Structural overview (dtypes, memory usage) of the test frame.
test.info()

In [None]:
# Finding duplicate rows: keep only the rows that repeat an earlier row exactly.

is_repeat = train.duplicated()
dupli = train.loc[is_repeat]
dupli

*No duplicate rows found.*

In [None]:
# Removing all the columns having only 1 unique value.
# The constant columns are identified on the training data and dropped from both frames.

constant_columns = [column for column in train.columns if train[column].nunique() == 1]
for column in constant_columns:
    print(column)
    train.drop(columns = column, inplace = True)
    test.drop(columns = column, inplace = True)
        test.drop(i, inplace = True, axis = 1)

In [None]:
# Shape of the training frame after the constant columns were dropped.
train.shape

In [None]:
# Before removing outliers: box plots of the first (up to) 50 columns on a 5 x 10 grid.
# The column is passed explicitly as `x=`, matching the countplot calls in this notebook,
# so the orientation does not depend on how the installed seaborn version interprets a
# positional argument.

n_plots = min(50, train.shape[1])  # the grid holds 50 panels; never index past the last column
plt.figure(figsize = (20,20))
for i in range(n_plots):
    plt.subplot(5, 10, i+1)
    sns.boxplot(x = train.iloc[:, i])
    plt.xlabel(train.columns[i], size = 10)

In [None]:
def Outliers(data, ft):
    """Return the index labels of the rows whose `ft` value is an IQR outlier.

    A value is an outlier when it lies strictly below Q1 - 1.5 * IQR or strictly
    above Q3 + 1.5 * IQR, where Q1 / Q3 are the 25th / 75th percentiles of the column.

    Parameters
    ----------
    data : DataFrame holding the column.
    ft : name of the column to inspect.

    Returns
    -------
    The subset of ``data.index`` belonging to the outlying rows.
    """
    column = data[ft]
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    whisker = 1.5 * (q3 - q1)

    too_low = column.lt(q1 - whisker)
    too_high = column.gt(q3 + whisker)
    return data.index[too_low | too_high]

In [None]:
# Collect the labels of every row that is an IQR outlier in at least one feature column.
# The identifier and the label are skipped: they are not features, and a zero IQR on the
# imbalanced binary TARGET would mark every minority-class row as an "outlier".
feature_columns = [column for column in train.columns if column not in ('ID', 'TARGET')]

outlier_rows = set()
for column in feature_columns:
    outlier_rows.update(Outliers(train, column))
index = list(outlier_rows)
len(index)

In [None]:
# print("Size of training data before removing outliers is {}".format(train.shape))
# train.drop(index, inplace = True, axis = 0)
# print("Size of training data after removing outliers is {}".format(train.shape))

*Here I tried removing the outliers, but by removing them I was losing important information as well. That's why I decided not to remove the outliers.*

In [None]:
# Count plot for target column.

target_counts = train['TARGET'].value_counts()
print(target_counts)
fig, ax = plt.subplots(figsize = (8,5))
sns.countplot(x = train['TARGET'], ax = ax)
ax.set_xlabel('Target', size = 12)
ax.set_ylabel('Count', size = 12)
ax.set_title('Distribution in target column before resmpling', size = 12)

*Here we can see that the target column is imbalanced.*

In [None]:
# Keep the test IDs for the submission file, then remove the ID column from both frames.
test_id = test['ID']
for frame in (train, test):
    frame.drop(columns = 'ID', inplace = True)

In [None]:
# Feature matrix: every training column except the label.
x = train.drop(columns = 'TARGET')
x.head()

In [None]:
# Label vector: the TARGET column of the training data.
y = train['TARGET']
y.head()

In [None]:
# Sampling the data, to balance the classes in target column.
# random_state is pinned (same value as the train/test splits in this notebook) so the
# same rows are kept on every run and all downstream scores are reproducible.

sampler = RandomUnderSampler(random_state = 42)
x, y = sampler.fit_resample(x, y)

In [None]:
# Class counts of the resampled target; this plot shows the state AFTER under-sampling,
# so the title says so (the earlier plot covers the "before" state).
print(y.value_counts())
plt.figure(figsize = (8,5))
sns.countplot(x = y)
plt.xlabel('Target', size = 12)
plt.ylabel('Count', size = 12)
plt.title('Distribution in target column after resampling', size = 12)

In [None]:
# Applying standard scaler and min max scaler.
#
# Both scalers are fitted on the training rows only and then applied to every row, so the
# hold-out rows used for evaluation do not leak their statistics into the scaling.

col = x.columns

# Same test_size / random_state as the train_test_split calls used for model evaluation in
# this notebook; with an equal number of rows this yields the same row positions as xtrain.
train_pos, _ = train_test_split(np.arange(len(x)), test_size = 0.25, random_state = 42)
x_fit = x.iloc[train_pos]

std = StandardScaler()
std.fit(x_fit)
x_std = pd.DataFrame(data = std.transform(x), columns = col)

mms = MinMaxScaler()
mms.fit(x_fit)
x_mms = pd.DataFrame(data = mms.transform(x), columns = col)

In [None]:
# 75 % / 25 % split of the unscaled features.
# NOTE: xtrain / xtest / ytrain / ytest are reassigned inside the model-comparison loop further down.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.25, random_state = 42)

# Building models.

In [None]:
def Models(model, name, d, xtrain, ytrain, xtest, ytest):
    """Fit `model`, score it on both splits and append the scores to `d`.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``; ``predict_proba`` or
        ``decision_function`` is used for the ROC curve when available.
    name : label that is printed and stored under ``d['Name']``.
    d : dict of lists with the keys 'Name', 'Training ACU', 'Testing ACU', 'F1'
        and 'AUC'. It is mutated in place.
    xtrain, ytrain : features / labels the model is fitted on.
    xtest, ytest : hold-out features / labels used for the test metrics.

    Returns
    -------
    The same dict `d`, with one value appended to each of its lists.
    """
    print("Working on {} model".format(name))

    cla = model
    cla.fit(xtrain, ytrain)

    # Accuracy in percent; metric arguments follow the (y_true, y_pred) convention.
    train_accuracy = accuracy_score(ytrain, cla.predict(xtrain)) * 100

    test_predicted = cla.predict(xtest)
    test_accuracy = accuracy_score(ytest, test_predicted) * 100
    F1 = f1_score(ytest, test_predicted)

    # roc_curve expects (y_true, y_score) in that order, with a continuous score for the
    # positive class. Hard labels are only a last resort for models that offer no score.
    if hasattr(cla, 'predict_proba'):
        test_score = np.asarray(cla.predict_proba(xtest))[:, 1]
    elif hasattr(cla, 'decision_function'):
        test_score = cla.decision_function(xtest)
    else:
        test_score = test_predicted
    fpr, tpr, _ = roc_curve(ytest, test_score)
    AUC = auc(fpr, tpr)

    # The key spellings are kept as-is because callers build `d` with exactly these keys.
    d['Name'].append(name)
    d['Training ACU'].append(train_accuracy)
    d['Testing ACU'].append(test_accuracy)
    d['F1'].append(F1)
    d['AUC'].append(AUC)

    print("**********"*5)
    print()
    return d

In [None]:
# Evaluate every candidate model on the raw, standard-scaled and min-max-scaled features.
# `final` receives one score dict per feature set, in that order.
feature_sets = [x, x_std, x_mms]
final = []
for features in feature_sets:
    xtrain, xtest, ytrain, ytest = train_test_split(features, y, test_size = 0.25, random_state = 42)

    d = {'Name' : [], 'Training ACU': [], 'Testing ACU': [], 'F1': [], 'AUC': []}

    # Fresh estimators for each feature set so no fitted state is carried over.
    # random_state pins the randomised tree learners so the comparison table is
    # the same on every run.
    models = [
        [RandomForestClassifier(n_estimators = 200, random_state = 42), 'Random Forest'],
        [DecisionTreeClassifier(random_state = 42), 'Decision Tree'],
        [XGBClassifier(tree_method='gpu_hist'), 'XGBoost'],
        [CatBoostClassifier(task_type="GPU"), 'CatBoost'],
        [GaussianNB(), 'Naive Bayes'],
        [LogisticRegression(), 'Logistic Regression']
    ]

    for estimator, label in models:
        d = Models(estimator, label, d, xtrain, ytrain, xtest, ytest)
    final.append(d)

In [None]:
# Print one score table per feature set, in the order the sets were evaluated.
name = ['Normal', 'Standard', 'Min Max']
for position, label in enumerate(name):
    print(label)
    acu_data = pd.DataFrame(data = final[position])
    print(acu_data)
    print("******"*12)

*Here we choose CatBoost with standard-scaled features, since it gives the best score.*

# Making predictions on test data.

In [None]:
# Recreate the 75 % / 25 % split on the standard-scaled features for the final model.
xtrain, xtest, ytrain, ytest = train_test_split(x_std, y, test_size = 0.25, random_state = 42)

In [None]:
# Fit the chosen CatBoost classifier on the training split of the standard-scaled data.
# task_type = 'GPU' requests GPU training; verbose = 100 is passed through to fit
# (presumably the logging interval — confirm against the CatBoost docs).
cla = CatBoostClassifier(task_type = 'GPU')
cla.fit(xtrain, ytrain, verbose = 100)

In [None]:
# Apply the already-fitted standard scaler to the test features and restore the column names.
col = test.columns
x_test = pd.DataFrame(data = std.transform(test), columns = col)
x_test.head()

In [None]:
# Predict on the scaled test features with the fitted model.
# NOTE(review): if the submission is scored on ROC AUC, predict_proba scores may be
# preferable to the output of predict — confirm against the competition rules.
predicted = cla.predict(x_test)
predicted

In [None]:
# Display the sample submission to check the expected output format.
sample

In [None]:
# Pair each saved test ID with its prediction in a two-column submission frame.
submission_columns = {'ID' : test_id, 'TARGET' : predicted}
submit = pd.DataFrame(submission_columns)
submit

In [None]:
# Write the submission to the working directory; index = False leaves the row index out of the file.
submit.to_csv('Submission1.csv', index = False)