In [135]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [136]:
import os
path='/content/gdrive/MyDrive/MachineLearning/SVM'
os.chdir(path)

Target based Encoding of the categorical attributes

In [137]:
#to read the data
import pandas as pd
import numpy as np
#for data preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
# for modelling
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.svm import SVC,SVR
# Performance Metrics
from sklearn.metrics import confusion_matrix,classification_report

In [138]:
#read the data
df=pd.read_csv('UnivBank.csv', na_values=['?','#'])

In [139]:
df.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0.0,0,1.0,0.0,0,0
1,2,45,19,34,90089,3,1.5,1,0.0,0,1.0,0.0,0,0
2,3,39,15,11,94720,1,1.0,1,0.0,0,0.0,0.0,0,0
3,4,35,9,100,94112,1,2.7,2,0.0,0,0.0,,0,0
4,5,35,8,45,91330,4,1.0,2,0.0,0,0.0,0.0,0,1


In [140]:
df.tail()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
4995,4996,29,3,40,92697,1,1.9,3,0.0,0,0.0,0.0,1,0
4996,4997,30,4,15,92037,4,0.4,1,85.0,0,0.0,0.0,1,0
4997,4998,63,39,24,93023,2,0.3,3,0.0,0,0.0,0.0,0,0
4998,4999,65,40,49,90034,3,0.5,2,0.0,0,0.0,0.0,1,0
4999,5000,28,4,83,92612,3,0.8,1,0.0,0,0.0,0.0,1,1


In [141]:
 df.describe()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,4998.0,5000.0,4998.0,4999.0,5000.0,5000.0
mean,2500.5,45.3384,20.1046,73.7742,93152.503,2.3964,1.937938,1.881,56.521409,0.096,0.104442,0.060412,0.5968,0.294
std,1443.520003,11.463166,11.467954,46.033729,2121.852197,1.147663,1.747659,0.839869,101.727873,0.294621,0.305863,0.238273,0.490589,0.455637
min,1.0,23.0,-3.0,8.0,9307.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1250.75,35.0,10.0,39.0,91911.0,1.0,0.7,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2500.5,45.0,20.0,64.0,93437.0,2.0,1.5,2.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,3750.25,55.0,30.0,98.0,94608.0,3.0,2.5,3.0,101.0,0.0,0.0,0.0,1.0,1.0
max,5000.0,67.0,43.0,224.0,96651.0,4.0,10.0,3.0,635.0,1.0,1.0,1.0,1.0,1.0


In [142]:
df.isnull().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              2
Personal Loan         0
Securities Account    2
CD Account            1
Online                0
CreditCard            0
dtype: int64

In [143]:
X_train,X_test,y_train,y_test=train_test_split(df.loc[:,df.columns!='Personal Loan'],df['Personal Loan'],test_size=0.30,random_state=100)

In [144]:
X_train.shape
X_test.shape

(1500, 13)

In [145]:
y_train.shape

(3500,)

In [146]:
y_test.shape

(1500,)

In [147]:
y_train.value_counts(normalize=True)*100

0    90.8
1     9.2
Name: Personal Loan, dtype: float64

In [148]:
X_train.columns

Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Securities Account', 'CD Account', 'Online',
       'CreditCard'],
      dtype='object')

In [149]:
#converting numeric to categorical
dropcols=['ID','ZIP Code']
catcols=['Family','Education', 'Securities Account', 'CD Account', 'Online',
       'CreditCard']
X_train[catcols]=X_train[catcols].astype('category')
X_train.dtypes

ID                       int64
Age                      int64
Experience               int64
Income                   int64
ZIP Code                 int64
Family                category
CCAvg                  float64
Education             category
Mortgage               float64
Securities Account    category
CD Account            category
Online                category
CreditCard            category
dtype: object

In [150]:
X_test[catcols]=X_test[catcols].astype('category')
X_test.dtypes

ID                       int64
Age                      int64
Experience               int64
Income                   int64
ZIP Code                 int64
Family                category
CCAvg                  float64
Education             category
Mortgage               float64
Securities Account    category
CD Account            category
Online                category
CreditCard            category
dtype: object

In [151]:
X_train.isnull().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              1
Securities Account    1
CD Account            1
Online                0
CreditCard            0
dtype: int64

In [152]:
#dropping the unnecessary columns
X_train=X_train.drop(dropcols,axis=1)
X_test=X_test.drop(dropcols,axis=1)

In [153]:
#imputing na values
imputeMortgage=SimpleImputer(strategy='mean')
imputeSec_CD=SimpleImputer(strategy='most_frequent')

In [154]:
#separating numeric and categorical
X_train_num=X_train.drop(catcols,axis=1)
X_train_cat=X_train[catcols]

In [155]:
X_test_num=X_test.drop(catcols,axis=1)
X_test_cat=X_test[catcols]

In [156]:
#Imputing the train data
X_train_num=pd.DataFrame(imputeMortgage.fit_transform(X_train_num),columns=X_train_num.columns)
X_train_cat=pd.DataFrame(imputeSec_CD.fit_transform(X_train_cat),columns=X_train_cat.columns)

In [157]:
X_train_num.isna().sum()

Age           0
Experience    0
Income        0
CCAvg         0
Mortgage      0
dtype: int64

In [158]:
#imputing the test data
X_test_num=pd.DataFrame(imputeMortgage.transform(X_test_num),columns=X_test_num.columns)
X_test_cat=pd.DataFrame(imputeSec_CD.transform(X_test_cat),columns=X_test_cat.columns)

In [159]:
X_train_num.isna().sum()

Age           0
Experience    0
Income        0
CCAvg         0
Mortgage      0
dtype: int64

In [160]:
X_train_cat.isnull().sum()

Family                0
Education             0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

In [161]:
X_test_num

Unnamed: 0,Age,Experience,Income,CCAvg,Mortgage
0,29.0,3.0,31.0,0.3,0.0
1,59.0,34.0,60.0,2.1,234.0
2,58.0,33.0,23.0,0.2,0.0
3,54.0,29.0,34.0,0.1,0.0
4,56.0,31.0,11.0,0.2,90.0
...,...,...,...,...,...
1495,50.0,24.0,155.0,7.3,0.0
1496,34.0,9.0,134.0,4.6,164.0
1497,52.0,26.0,158.0,3.7,251.0
1498,60.0,35.0,122.0,1.3,0.0


In [162]:
#Standardizing the numeric data
sd=StandardScaler()
X_train_num=pd.DataFrame(sd.fit_transform(X_train_num),columns=X_train_num.columns)
X_test_num=pd.DataFrame(sd.transform(X_test_num),columns=X_test_num.columns)

In [163]:
#Encoding the categorical attributes
ohe=OneHotEncoder(handle_unknown='ignore')
X_train_cat=pd.DataFrame(ohe.fit_transform(X_train_cat))


In [164]:
X_train_cat.head()

Unnamed: 0,0
0,"(0, 0)\t1.0\n (0, 4)\t1.0\n (0, 7)\t1.0\n ..."
1,"(0, 3)\t1.0\n (0, 4)\t1.0\n (0, 8)\t1.0\n ..."
2,"(0, 3)\t1.0\n (0, 6)\t1.0\n (0, 7)\t1.0\n ..."
3,"(0, 0)\t1.0\n (0, 6)\t1.0\n (0, 7)\t1.0\n ..."
4,"(0, 0)\t1.0\n (0, 6)\t1.0\n (0, 7)\t1.0\n ..."
