In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import os
# Make the project folder on the mounted drive the working directory, so the
# relative file names used in later cells resolve against it.
path='/content/gdrive/MyDrive/MachineLearning/SVM'
os.chdir(path)

In [None]:
#to read the data
import pandas as pd
import numpy as np
#for data preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
# for modelling
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.svm import SVC,SVR
# Performance Metrics
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

In [None]:
# Load the dataset; the placeholder strings '?' and '#' are parsed as missing values.
df=pd.read_csv('UnivBank.csv', na_values=['?','#'])

In [None]:
# Only the last expression of a cell is rendered, so this cell displays df.shape;
# the df.head() result is computed but not shown.
df.head()
df.shape

(5000, 14)

In [None]:
# Show the trailing rows of the raw frame.
df.tail()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
4995,4996,29,3,40,92697,1,1.9,3,0.0,0,0.0,0.0,1,0
4996,4997,30,4,15,92037,4,0.4,1,85.0,0,0.0,0.0,1,0
4997,4998,63,39,24,93023,2,0.3,3,0.0,0,0.0,0.0,0,0
4998,4999,65,40,49,90034,3,0.5,2,0.0,0,0.0,0.0,1,0
4999,5000,28,4,83,92612,3,0.8,1,0.0,0,0.0,0.0,1,1


In [None]:
 # Summary statistics for every numeric column (count reveals columns with missing values).
 df.describe()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,4998.0,5000.0,4998.0,4999.0,5000.0,5000.0
mean,2500.5,45.3384,20.1046,73.7742,93152.503,2.3964,1.937938,1.881,56.521409,0.096,0.104442,0.060412,0.5968,0.294
std,1443.520003,11.463166,11.467954,46.033729,2121.852197,1.147663,1.747659,0.839869,101.727873,0.294621,0.305863,0.238273,0.490589,0.455637
min,1.0,23.0,-3.0,8.0,9307.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1250.75,35.0,10.0,39.0,91911.0,1.0,0.7,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2500.5,45.0,20.0,64.0,93437.0,2.0,1.5,2.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,3750.25,55.0,30.0,98.0,94608.0,3.0,2.5,3.0,101.0,0.0,0.0,0.0,1.0,1.0
max,5000.0,67.0,43.0,224.0,96651.0,4.0,10.0,3.0,635.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Number of missing values per column in the full dataset.
df.isnull().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              2
Personal Loan         0
Securities Account    2
CD Account            1
Online                0
CreditCard            0
dtype: int64

In [None]:
# Income is the regression target; every other column is kept as a candidate
# feature. 30% of the rows are held out for testing and random_state pins the
# shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='Income'),
    df['Income'],
    test_size=0.30,
    random_state=100,
)

In [None]:
# (rows, columns) of the training features.
X_train.shape
# X_test.shape

(3500, 13)

In [None]:
# Number of training targets; should match the row count of X_train.
y_train.shape

(3500,)

In [None]:
# Number of held-out targets.
y_test.shape

(1500,)

In [None]:
# Percentage of training rows taken by each distinct Income value.
y_train.value_counts(normalize=True)*100

38     1.800000
81     1.714286
44     1.714286
40     1.600000
28     1.571429
         ...   
192    0.085714
189    0.057143
202    0.057143
203    0.028571
224    0.028571
Name: Income, Length: 159, dtype: float64

In [None]:
# Feature names available after removing the target.
X_train.columns

Index(['ID', 'Age', 'Experience', 'ZIP Code', 'Family', 'CCAvg', 'Education',
       'Mortgage', 'Personal Loan', 'Securities Account', 'CD Account',
       'Online', 'CreditCard'],
      dtype='object')

In [None]:
# Columns removed in a later cell, and the discrete columns to treat as categorical.
dropcols = ['ID', 'ZIP Code']
catcols = [
    'Family', 'Personal Loan', 'Securities Account', 'CD Account',
    'Online', 'CreditCard', 'Education',
]

# Cast each discrete training column from its numeric dtype to category.
for col in catcols:
    X_train[col] = X_train[col].astype('category')

X_train.dtypes

ID                       int64
Age                      int64
Experience               int64
ZIP Code                 int64
Family                category
CCAvg                  float64
Education             category
Mortgage               float64
Personal Loan         category
Securities Account    category
CD Account            category
Online                category
CreditCard            category
dtype: object

In [None]:
# Apply the same category cast to the held-out features.
for col in catcols:
    X_test[col] = X_test[col].astype('category')

X_test.dtypes

ID                       int64
Age                      int64
Experience               int64
ZIP Code                 int64
Family                category
CCAvg                  float64
Education             category
Mortgage               float64
Personal Loan         category
Securities Account    category
CD Account            category
Online                category
CreditCard            category
dtype: object

In [None]:
# Missing values per column in the training features (these are imputed below).
X_train.isnull().sum()

ID                    0
Age                   0
Experience            0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              1
Personal Loan         0
Securities Account    1
CD Account            1
Online                0
CreditCard            0
dtype: int64

In [None]:
# Remove the columns listed in dropcols from both splits.
X_train = X_train.drop(columns=dropcols)
X_test = X_test.drop(columns=dropcols)

In [None]:
# Numeric gaps are filled with the column mean, categorical gaps with the most
# frequent value. Both imputers are fitted on the training split in a later cell.
imputeMortgage=SimpleImputer(strategy='mean')
imputeSec_CD=SimpleImputer(strategy='most_frequent')

In [None]:
# Separate the training features into categorical and numeric parts so each
# can go through its own preprocessing.
X_train_cat = X_train[catcols]
X_train_num = X_train.drop(columns=catcols)

In [None]:
# Same categorical / numeric separation for the held-out features.
X_test_cat = X_test[catcols]
X_test_num = X_test.drop(columns=catcols)

In [None]:
# Fit the imputers on the training split and fill its gaps. fit_transform
# returns plain arrays, so the frames are rebuilt with the original column
# names; no index is passed, so the rebuilt frames get a fresh 0..n-1 index.
X_train_num = pd.DataFrame(
    imputeMortgage.fit_transform(X_train_num),
    columns=X_train_num.columns,
)
X_train_cat = pd.DataFrame(
    imputeSec_CD.fit_transform(X_train_cat),
    columns=X_train_cat.columns,
)

In [None]:
# Only the last expression is rendered: the null count on the first line is
# computed but not shown, and the cell displays the head of the imputed
# categorical frame.
X_train_num.isnull().sum()
X_train_cat.head()

Unnamed: 0,Family,Personal Loan,Securities Account,CD Account,Online,CreditCard,Education
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,4.0,0.0,1.0,0.0,0.0,0.0,1.0
2,4.0,0.0,0.0,0.0,1.0,0.0,3.0
3,1.0,1.0,0.0,1.0,1.0,1.0,3.0
4,1.0,0.0,0.0,0.0,0.0,0.0,3.0


In [None]:
# Sanity checks after imputation. Only the final expression (the number of
# training targets) is rendered; the first two results are discarded.
X_train_cat.isnull().sum()
X_train_cat.count()
y_train.count()

3500

In [None]:
# Fill gaps in the held-out split using the statistics learned from training
# (transform only, no refit), then rebuild the frames with their column names.
X_test_num = pd.DataFrame(
    imputeMortgage.transform(X_test_num),
    columns=X_test_num.columns,
)
X_test_cat = pd.DataFrame(
    imputeSec_CD.transform(X_test_cat),
    columns=X_test_cat.columns,
)

In [None]:
# Confirm no missing values remain in the numeric test features.
X_test_num.isnull().sum()


Age           0
Experience    0
CCAvg         0
Mortgage      0
dtype: int64

In [None]:
# Confirm no missing values remain in the categorical test features.
X_test_cat.isnull().sum()

Family                0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
Education             0
dtype: int64

In [None]:
# Standardize the numeric features. The scaler is fitted on the training split
# only and then reused unchanged on the held-out split.
sd = StandardScaler()
X_train_num = pd.DataFrame(
    sd.fit_transform(X_train_num),
    columns=X_train_num.columns,
)
X_test_num = pd.DataFrame(
    sd.transform(X_test_num),
    columns=X_test_num.columns,
)


In [None]:
pip install category-encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category-encoders
  Downloading category_encoders-2.5.1.post0-py2.py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 324 kB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.5.1.post0


In [None]:
from category_encoders import TargetEncoder

# The feature frames were rebuilt with a fresh 0..n-1 index during imputation,
# so the targets are re-indexed the same way to keep rows aligned.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# After imputation the categorical frame holds plain numeric columns. With
# cols=None the encoder only picks up string/category columns and would leave
# this data untouched, so the columns to encode are named explicitly.
encoder = TargetEncoder(cols=list(X_train_cat.columns))
X_train_cat = encoder.fit_transform(X_train_cat, y_train)
# Held-out rows are encoded with the statistics learned from the training
# split; the test target is deliberately not passed.
X_test_cat = encoder.transform(X_test_cat)

# Target-encoded values are on the scale of Income, while the numeric features
# were standardized. SVR kernels are scale-sensitive, so the encoded columns
# are standardized as well (fitted on train, applied to test).
cat_scaler = StandardScaler()
X_train_cat = pd.DataFrame(
    cat_scaler.fit_transform(X_train_cat),
    columns=X_train_cat.columns,
)
X_test_cat = pd.DataFrame(
    cat_scaler.transform(X_test_cat),
    columns=X_test_cat.columns,
)





In [None]:
# #Encoding the categorical attributes
# ohe=OneHotEncoder(handle_unknown='ignore')
# # column=ohe.get_feature_names(X_train_cat.columns
# X_train_cat=pd.DataFrame(ohe.fit_transform(X_train_cat).todense(),columns=ohe.get_feature_names_out())
# X_test_cat=pd.DataFrame(ohe.transform(X_test_cat).todense(),columns=ohe.get_feature_names_out())


In [None]:
# Inspect the categorical training features after the encoding cell has run.
X_train_cat.head()

Unnamed: 0,Family,Personal Loan,Securities Account,CD Account,Online,CreditCard,Education
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,4.0,0.0,1.0,0.0,0.0,0.0,1.0
2,4.0,0.0,0.0,0.0,1.0,0.0,3.0
3,1.0,1.0,0.0,1.0,1.0,1.0,3.0
4,1.0,0.0,0.0,0.0,0.0,0.0,3.0


In [None]:
# Join the numeric and categorical parts column-wise into the final training
# matrix. Only the last expression (Xtrain.dtypes) is rendered; Xtrain.shape is
# computed but not shown.
Xtrain=pd.concat([X_train_num,X_train_cat],axis=1)
Xtrain.shape
# y_train.shape
Xtrain.dtypes

Age                   float64
Experience            float64
CCAvg                 float64
Mortgage              float64
Family                float64
Personal Loan         float64
Securities Account    float64
CD Account            float64
Online                float64
CreditCard            float64
Education             float64
dtype: object

In [None]:
# Join the numeric and categorical parts column-wise into the final test matrix.
Xtest=pd.concat([X_test_num,X_test_cat],axis=1)

In [None]:
# Exhaustive 5-fold cross-validated search over the SVR penalty C, the kernel
# type and the kernel coefficient gamma (27 combinations).
model = SVR()
param_grid = {
    'C': [1.0, 1.5, 2.0],
    'kernel': ['rbf', 'sigmoid', 'poly'],
    'gamma': [1.0, 0.1, 0.01],
}
search = GridSearchCV(model, param_grid, cv=5)
search.fit(Xtrain, y_train)


GridSearchCV(cv=5, estimator=SVR(),
             param_grid={'C': [1.0, 1.5, 2.0], 'gamma': [1.0, 0.1, 0.01],
                         'kernel': ['rbf', 'sigmoid', 'poly']})

In [None]:
# Hyper-parameter combination with the best mean cross-validation score.
print(search.best_params_)


{'C': 1.0, 'gamma': 1.0, 'kernel': 'poly'}


In [None]:
# Refit on the whole training set using whatever the grid search selected.
# Reading the values from search.best_params_ (instead of copying them in by
# hand) keeps this cell correct when the data, preprocessing or grid changes.
model=SVR(**search.best_params_)
svr=model.fit(Xtrain,y_train)

In [None]:
# Predictions of the fitted model on the training and the held-out features.
svrtrain=model.predict(Xtrain)
svrtest=model.predict(Xtest)

In [None]:

# MAPE is not symmetric: scikit-learn expects (y_true, y_pred) and normalises
# by |y_true|, so the actual targets must come first.
print(mean_absolute_percentage_error(y_train,svrtrain))
# Report the held-out error as well; the training error alone says nothing
# about how the model generalises.
print(mean_absolute_percentage_error(y_test,svrtest))

0.38578710155488777
