# **Which customers are happy customers?**

![](https://icon2.cleanpng.com/20180320/lew/kisspng-customer-satisfaction-customer-service-net-promote-happy-customers-icon-5ab1956e072b84.2928019215215875660294.jpg)

You are provided with an anonymized dataset containing a large number of numeric variables. The "TARGET" column is the variable to predict. It equals 1 for unsatisfied customers and 0 for satisfied customers.

The task is to predict the probability that each customer in the test set is an unsatisfied customer.

![](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSgUdWNHQejHsxV3IWvCKsCtNHp5T3HI_Ai8Q&usqp=CAU)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
train=pd.read_csv("../input/santander-customer-satisfaction/train.csv")

In [None]:
train.shape

In [None]:
train.describe()

In [None]:
test=pd.read_csv("../input/santander-customer-satisfaction/test.csv")

In [None]:
test_ox=test.copy()

In [None]:
test.head()

In [None]:
train.head()

In [None]:
train.columns

In [None]:
train.isnull().sum()

In [None]:
train.drop(['ID'],axis=1,inplace=True)

In [None]:
train.columns

In [None]:
train['TARGET'].value_counts()

In [None]:
# Names of the columns whose dtype compares equal to the builtin `int`.
categories = [name for name, dtype in train.dtypes.items() if dtype == int]

In [None]:
# Names of the columns whose dtype compares equal to the builtin `float`.
floats = [name for name, dtype in train.dtypes.items() if dtype == float]

In [None]:
len(floats)

In [None]:
len(categories)

In [None]:
y=train['TARGET']

In [None]:
# Feature matrix: every remaining column except the label. Dropping TARGET by
# name (rather than slicing a fixed number of columns by position) guarantees
# the label never ends up among the features, whatever the width of `train`.
X = train.drop(columns=['TARGET'])

In [None]:
X.head()

In [None]:
from collections import Counter
from imblearn.over_sampling import SMOTE

# Balance the classes with SMOTE. The seed is fixed (matching the
# random_state=1 used for the K-fold split and the hyper-parameter search) so
# that the resampled data, and everything trained on it, is reproducible.
smote = SMOTE(random_state=1)
x_smote, y_smote = smote.fit_resample(X, y)
# Class counts before and after resampling.
print('Origianl dataset shape:', Counter(y))
print('Resample dataset shape:', Counter(y_smote))

In [None]:
columns=x_smote.columns
ind=x_smote.index

In [None]:
from sklearn.preprocessing import StandardScaler


In [None]:
sc=StandardScaler()

In [None]:
test.drop(['ID'],axis=1,inplace=True)

In [None]:
test_columns=test.columns

In [None]:
test_index=test.index

In [None]:
# Standardise the test set with the mean/variance of the (still unscaled)
# resampled training features, restricted to the test columns. Fitting the
# scaler on the test set itself would put train and test on different scales.
test=sc.fit(x_smote[test_columns]).transform(test)

In [None]:
test=pd.DataFrame(test,columns=test_columns,index=test_index)

In [None]:
test.head()

In [None]:
x_smote=sc.fit_transform(x_smote)

In [None]:
x_smote=pd.DataFrame(x_smote,columns=columns,index=ind)

In [None]:
x_smote.head()

In [None]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, random_state=1, shuffle=True)
# Hold out the last of the five shuffled folds as the validation set; the
# remaining four folds form the training set. The held-out indices are named
# `val_index` to keep them distinct from `test_index`, which holds the index
# of the real test set.
*_, (train_index, val_index) = kf.split(x_smote, y_smote)
xtr, xvl = x_smote.iloc[train_index], x_smote.iloc[val_index]
ytr, yvl = y_smote.iloc[train_index], y_smote.iloc[val_index]

In [None]:
xtr.head()

Principal Component Analysis, or PCA, is a dimensionality-reduction method that is often used to reduce the dimensionality of large data sets, by transforming a large set of variables into a smaller one that still contains most of the information in the large set.

Reducing the number of variables of a data set naturally comes at the expense of accuracy, but the trick in dimensionality reduction is to trade a little accuracy for simplicity: smaller data sets are easier to explore and visualize, and, without extraneous variables to process, they make analysis much easier and faster for machine learning algorithms.

So to sum up, the idea of PCA is simple — reduce the number of variables of a data set, while preserving as much information as possible.

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=57)
# Learn the projection from the training fold only ...
pca_train_data = pca.fit_transform(xtr)
# ... and apply that same projection to the validation fold. Refitting here
# would give the validation data its own, unrelated set of components.
pca_test_data = pca.transform(xvl)

print(pca_train_data.shape, '\n')

# Share of the variance captured by each of the retained components.
explained_variance = pca.explained_variance_ratio_
print(explained_variance)

In [None]:
test.shape

In [None]:
# NOTE(review): fit_transform refits `pca` on the test set, so these components
# come from a different projection than the one that produced `pca_train_data`,
# and any later read of `pca.explained_variance_ratio_` reflects this test-set
# fit. Reusing the training projection via pca.transform(test) presumably
# requires `test` to have exactly the columns of `xtr` - confirm before switching.
pca_test=pca.fit_transform(test)

In [None]:
pca_test.shape

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV ,StratifiedKFold, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from pandas_profiling import ProfileReport
from sklearn.feature_selection import RFECV
import operator
import warnings
warnings.simplefilter(action = "ignore")

In [None]:
# Baseline classifiers to compare, as (label, unfitted estimator) pairs.
# Note that the 'XGB' label refers to scikit-learn's GradientBoostingClassifier.
models = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('RF', RandomForestClassifier()),
    #('SVM', SVC(gamma='auto')),
    ('XGB', GradientBoostingClassifier()),
    ("LightGBM", LGBMClassifier()),
]

# Collected while evaluating each model in turn: one score and one label each.
results, names = [], []

In [None]:
def metrics(true,predicted):
    """Return the ROC AUC of `predicted` measured against the `true` labels."""
    auc = roc_auc_score(true, predicted)
    return auc

In [None]:
# Fit every candidate on the PCA-projected training fold and score it on the
# validation fold.
for name, model in models:
    model.fit(pca_train_data, ytr)
    # ROC AUC is a ranking metric: feed it the positive-class probability, not
    # the hard 0/1 labels from predict(), which reduce the ROC curve to a
    # single operating point.
    pred = model.predict_proba(pca_test_data)[:, 1]
    res = metrics(yvl, pred)
    results.append(res)
    names.append(name)
    msg = f"{name}: {res:f} "
    print(msg)

In [None]:
# Randomised hyper-parameter search for the random forest, scored by ROC AUC
# with 15-fold cross-validation on the PCA-projected training fold.
# The search object gets its own name so it does not overwrite `sc`, the
# StandardScaler used for preprocessing.
rf_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions={
        'n_estimators': [int(x) for x in np.linspace(start=1, stop=50, num=10)],
        'criterion': ['gini', 'entropy'],
        # 'auto' meant the same as 'sqrt' for classifiers and is no longer
        # accepted by recent scikit-learn releases, so it is not offered.
        'max_features': ['sqrt', 'log2'],
        # An integer min_samples_split must be at least 2; a value of 1 only
        # produces failed fits during the search.
        'min_samples_split': range(2, 10),
        'min_samples_leaf': range(1, 10),
        # NOTE(review): num=2 yields only the two depths 1 and 30 - confirm
        # that a finer grid was not intended.
        'max_depth': [int(x) for x in np.linspace(start=1, stop=30, num=2)],
    },
    scoring='roc_auc', cv=15, verbose=0, n_jobs=-1, random_state=1,
)

grid_res = rf_search.fit(pca_train_data, ytr)
best_params = grid_res.best_params_

In [None]:
# Random forest configured with the hyper-parameters chosen by the search
# (n_estimators, criterion, max_features, min_samples_split, min_samples_leaf
# and max_depth are exactly the keys of `best_params`).
random = RandomForestClassifier(
    **best_params,
    random_state=1,
    verbose=0,
    n_jobs=-1,
)

In [None]:
random.fit(pca_train_data,ytr)

In [None]:
# Positive-class probability on the validation fold: ROC AUC needs a ranking
# score, and hard 0/1 labels would understate it.
r_pre=random.predict_proba(pca_test_data)[:, 1]

In [None]:
metrics(yvl,r_pre)

In [None]:
# The task asks for the probability of an unsatisfied customer, and this
# output is blended with another model's scores, so keep the positive-class
# probability rather than a hard 0/1 label.
r_predict=random.predict_proba(pca_test)[:, 1]

In [None]:
import plotly.express as px

In [None]:

# Pairwise scatter plots of the first four principal components of the
# training fold, coloured by label. Axis captions read "PC k (xx.x%)" and are
# keyed by the component's position as a string.
labels = {}
for i, var in enumerate(pca.explained_variance_ratio_ * 100):
    labels[str(i)] = f"PC {i+1} ({var:.1f}%)"

fig = px.scatter_matrix(pca_train_data, labels=labels, dimensions=range(4), color=ytr)
fig.update_traces(diagonal_visible=False)
fig.show()

In [None]:
# Pairwise scatter plots of the first four principal components of the
# validation fold, coloured by label. Axis captions read "PC k (xx.x%)" and
# are keyed by the component's position as a string.
labels = {}
for i, var in enumerate(pca.explained_variance_ratio_ * 100):
    labels[str(i)] = f"PC {i+1} ({var:.1f}%)"

fig = px.scatter_matrix(pca_test_data, labels=labels, dimensions=range(4), color=yvl)
fig.update_traces(diagonal_visible=False)
fig.show()

In [None]:
import lightgbm as lgb

In [None]:
# Train a LightGBM booster on the PCA-projected training fold for 100 rounds
# with a binary objective and binary log-loss as the tracked metric.
d_train = lgb.Dataset(pca_train_data, label=ytr)
params = {
    'learning_rate': 0.03,
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'binary_logloss',
    'sub_feature': 0.5,
    'num_leaves': 50,
    'min_data': 50,
    'max_depth': 50,
}
clf = lgb.train(params, d_train, 100)

Validation scores observed for two tree-size settings:

- `max_depth=10`, `num_leaves=10`: score = 0.6804436345667506
- `max_depth=50`, `num_leaves=50`: score = 0.7003318913308098

In [None]:
y_pred=clf.predict(pca_test_data)

In [None]:
# Turn the predicted scores into hard labels in place: 1 where the score is
# at least 0.5, otherwise 0. The boolean mask is cast to the array's own dtype
# on assignment, so this replaces the element-by-element Python loop.
y_pred[:] = y_pred >= 0.5

In [None]:
y_pred

In [None]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(yvl,y_pred)
cm

In [None]:
metrics(yvl,y_pred)

In [None]:
y_predict=clf.predict(pca_test)

In [None]:
from sklearn.metrics import accuracy_score
# scikit-learn metrics take (y_true, y_pred); pass the validation labels first,
# in the same order as the confusion_matrix call on the same data.
accuracy=accuracy_score(yvl,y_pred)
accuracy

In [None]:
# Weighted blend of the two models' test-set outputs: weight 0.5 on the
# LightGBM booster (`y_predict`) and 0.4 on the random forest (`r_predict`).
# NOTE(review): the weights sum to 0.9 rather than 1 - confirm this is intended.
final=y_predict*0.5+(0.4*r_predict)

In [None]:
test_ox['TARGET']=final

In [None]:
# Write the submission file: only the customer ID and the blended score,
# without the row index.
submission = test_ox[['ID', 'TARGET']]
submission.to_csv('exp.csv', mode='w', index=False)