# Objective: To understand the support vector machines for multi-class classification and regression problems.

## Multiclass classification dataset:

### This is a Glass Identification Data Set from UCI. It contains 10 attributes including id. The response is glass type(discrete 7 values)

### Attribute Information:
1.	Id number: 1 to 214 (removed from CSV file)
2.	RI: refractive index
3.	Na: Sodium (unit measurement: weight percent in corresponding oxide, as are attributes 4-10)
4.	Mg: Magnesium
5.	Al: Aluminum
6.	Si: Silicon
7.	K: Potassium
8.	Ca: Calcium
9.	Ba: Barium
10.	Fe: Iron

### Target class

Type of glass: (class attribute)
- 1 buildingwindowsfloatprocessed 
- 2 buildingwindowsnonfloatprocessed
- 3 vehiclewindowsfloatprocessed
- 4 vehiclewindowsnonfloatprocessed (none in this database)
- 5 containers
- 6 tableware
- 7 headlamps

## Task 1: Multi-class Support vector machine (SVM) 

In [1]:
# Load the libraries
from sklearn.svm import SVC,SVR
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score,classification_report,mean_squared_error

In [None]:
# Load the dataset 
data = pd.read_csv('data/glass.csv')
data.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [None]:
data.describe()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
count,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0
mean,1.518365,13.40785,2.684533,1.444907,72.650935,0.497056,8.956963,0.175047,0.057009,2.780374
std,0.003037,0.816604,1.442408,0.49927,0.774546,0.652192,1.423153,0.497219,0.097439,2.103739
min,1.51115,10.73,0.0,0.29,69.81,0.0,5.43,0.0,0.0,1.0
25%,1.516523,12.9075,2.115,1.19,72.28,0.1225,8.24,0.0,0.0,1.0
50%,1.51768,13.3,3.48,1.36,72.79,0.555,8.6,0.0,0.0,2.0
75%,1.519157,13.825,3.6,1.63,73.0875,0.61,9.1725,0.0,0.1,3.0
max,1.53393,17.38,4.49,3.5,75.41,6.21,16.19,3.15,0.51,7.0


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RI      214 non-null    float64
 1   Na      214 non-null    float64
 2   Mg      214 non-null    float64
 3   Al      214 non-null    float64
 4   Si      214 non-null    float64
 5   K       214 non-null    float64
 6   Ca      214 non-null    float64
 7   Ba      214 non-null    float64
 8   Fe      214 non-null    float64
 9   Type    214 non-null    int64  
dtypes: float64(9), int64(1)
memory usage: 16.8 KB


In [None]:
data.isna().sum()

RI      0
Na      0
Mg      0
Al      0
Si      0
K       0
Ca      0
Ba      0
Fe      0
Type    0
dtype: int64

In [None]:
# Preprocessing
# Encoding categorical variables (if any)
# Feature Scaling
# Filling missing values (if any)

In [None]:
X = data.drop(columns = ['Type'], axis = 1)
y = data['Type']

In [None]:
# sanity check
print("X shape : ", X.shape)
print("y shape : ", y.shape)

X shape :  (214, 9)
y shape :  (214,)


In [None]:
# scaling the values
cols = X.columns
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [None]:
X = pd.DataFrame(X, columns = cols)
X.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,0.432836,0.437594,1.0,0.252336,0.351786,0.009662,0.30855,0.0,0.0
1,0.283582,0.475188,0.801782,0.333333,0.521429,0.077295,0.223048,0.0,0.0
2,0.220808,0.421053,0.790646,0.389408,0.567857,0.062802,0.218401,0.0,0.0
3,0.285777,0.372932,0.821826,0.311526,0.5,0.091787,0.259294,0.0,0.0
4,0.275241,0.381955,0.806236,0.29595,0.583929,0.088567,0.245353,0.0,0.0


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# sanity check
print("X train shape : ", X_train.shape)
print("y train shape : ", y_train.shape)
print("X test shape : ", X_test.shape)
print("y test shape : ", y_test.shape)

X train shape :  (149, 9)
y train shape :  (149,)
X test shape :  (65, 9)
y test shape :  (65,)


In [None]:
# Build SVM model 
model=SVC()
model.fit(X_train,y_train)

SVC()

In [None]:
# Evaluate the build model on test dataset
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
print('Training Accuracy : ',accuracy_score(y_train, y_pred_train))
print('Testing Accuracy : ',accuracy_score(y_test, y_pred_test))

Training Accuracy :  0.7181208053691275
Testing Accuracy :  0.6461538461538462


In [None]:
# Evaluate training and testing accuracy

print(classification_report(y_train, y_pred_train))
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           1       0.62      0.86      0.72        51
           2       0.73      0.75      0.74        53
           3       0.00      0.00      0.00        13
           5       1.00      0.57      0.73         7
           6       1.00      0.33      0.50         6
           7       1.00      0.89      0.94        19

    accuracy                           0.72       149
   macro avg       0.72      0.57      0.61       149
weighted avg       0.69      0.72      0.69       149

              precision    recall  f1-score   support

           1       0.61      0.74      0.67        19
           2       0.57      0.74      0.64        23
           3       0.00      0.00      0.00         4
           5       1.00      0.33      0.50         6
           6       0.00      0.00      0.00         3
           7       0.90      0.90      0.90        10

    accuracy                           0.65        65
   macro avg       0.51

## Task 2: Implement support vector regression (SVR)


In [2]:
# Load training and testing datasets
train = pd.read_csv('Data/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('Data/house-prices-advanced-regression-techniques/test.csv')
data = train.append(test)
data.drop(columns = ['Id'], axis = 1, inplace = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [3]:
data[data.select_dtypes(include='object').columns] = data[data.select_dtypes(include='object').columns].astype('str')

In [4]:
# Apply pre-processing techniques
# Apply feature selection techniques of your choice to reduce the feature set

# label encoding the categorical values
le = LabelEncoder()

for i in train.select_dtypes(include='object').columns:
    data[i] = le.fit_transform(data[i])

In [5]:
# filling nan values by mean
null_cols = list(data.isna().any()[data.isna().any() == True].index)

imp = SimpleImputer()
data[null_cols] = imp.fit_transform(data[null_cols])

In [6]:
cols = list(data.columns)
print(cols)

['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'Alley', 'BedroomAbvGr', 'BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFinType1', 'BsmtFinType2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtQual', 'BsmtUnfSF', 'CentralAir', 'Condition1', 'Condition2', 'Electrical', 'EnclosedPorch', 'ExterCond', 'ExterQual', 'Exterior1st', 'Exterior2nd', 'Fence', 'FireplaceQu', 'Fireplaces', 'Foundation', 'FullBath', 'Functional', 'GarageArea', 'GarageCars', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'GarageYrBlt', 'GrLivArea', 'HalfBath', 'Heating', 'HeatingQC', 'HouseStyle', 'KitchenAbvGr', 'KitchenQual', 'LandContour', 'LandSlope', 'LotArea', 'LotConfig', 'LotFrontage', 'LotShape', 'LowQualFinSF', 'MSSubClass', 'MSZoning', 'MasVnrArea', 'MasVnrType', 'MiscFeature', 'MiscVal', 'MoSold', 'Neighborhood', 'OpenPorchSF', 'OverallCond', 'OverallQual', 'PavedDrive', 'PoolArea', 'PoolQC', 'RoofMatl', 'RoofStyle', 'SaleCondition', 'SalePrice', 'SaleType', 'ScreenPorch', 'Street', 'Tot

In [7]:
data.describe()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
count,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,...,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0
mean,1159.581706,336.483727,2.602261,1.891059,2.860226,0.505653,2.835903,2.327509,441.423235,49.582248,...,7.491607,16.06235,0.995889,6.451524,1051.777587,0.001713,93.709832,1971.312778,1984.264474,2007.792737
std,392.362079,428.701456,25.188169,0.423503,0.822693,1.206513,0.700631,1.151168,455.53275,169.176615,...,1.593719,56.184365,0.063996,1.569379,440.690726,0.05551,126.526589,30.291442,20.894344,1.314964
min,334.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1872.0,1950.0,2006.0
25%,876.0,0.0,0.0,2.0,2.0,0.0,3.0,2.0,0.0,0.0,...,8.0,0.0,1.0,5.0,793.0,0.0,0.0,1953.5,1965.0,2007.0
50%,1082.0,0.0,0.0,2.0,3.0,0.0,3.0,3.0,369.0,0.0,...,8.0,0.0,1.0,6.0,990.0,0.0,0.0,1973.0,1993.0,2008.0
75%,1387.5,704.0,0.0,2.0,3.0,0.0,3.0,3.0,733.0,0.0,...,8.0,0.0,1.0,7.0,1302.0,0.0,168.0,2001.0,2004.0,2009.0
max,5095.0,2065.0,508.0,2.0,8.0,4.0,4.0,4.0,5644.0,1526.0,...,9.0,576.0,1.0,15.0,6110.0,2.0,1424.0,2010.0,2010.0,2010.0


In [10]:
# scaling the numerical values
scaler = MinMaxScaler()
X = scaler.fit_transform(data)

data = pd.DataFrame(X, columns = cols)

In [11]:
data.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,0.109641,0.413559,0.0,1.0,0.375,0.0,0.75,0.75,0.125089,0.0,...,0.888889,0.0,1.0,0.461538,0.140098,0.0,0.0,0.949275,0.883333,0.5
1,0.194917,0.0,0.0,1.0,0.375,0.0,0.75,0.25,0.173281,0.0,...,0.888889,0.0,1.0,0.307692,0.206547,0.0,0.20927,0.753623,0.433333,0.25
2,0.123083,0.41937,0.0,1.0,0.375,0.0,0.75,0.5,0.086109,0.0,...,0.888889,0.0,1.0,0.307692,0.150573,0.0,0.0,0.934783,0.866667,0.5
3,0.131695,0.366102,0.0,1.0,0.375,0.0,0.25,0.75,0.038271,0.0,...,0.888889,0.0,1.0,0.384615,0.123732,0.0,0.0,0.311594,0.333333,0.0
4,0.170342,0.509927,0.0,1.0,0.5,0.0,0.75,0.0,0.116052,0.0,...,0.888889,0.0,1.0,0.538462,0.187398,0.0,0.134831,0.927536,0.833333,0.5


In [12]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('SalePrice',axis=1), data['SalePrice'], test_size=0.3, random_state=42)

In [13]:
# Train SVR model
model=SVR()
model.fit(X_train,y_train)



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [14]:
# Evaluate training and testing root mean square error
print('Training MSE : ',(mean_squared_error(y_train,model.predict(X_train)))**(0.5))
print('Testing MSE : ',(mean_squared_error(y_test,model.predict(X_test)))**(0.5))

Training MSE :  0.065498284150713
Testing MSE :  0.07011774221772349



## Task 3: Play with various SVM kernels such as polynomial, rbf, sigmoid tanh, etc.


In [15]:
#Play with various SVM kernels such as polynomial, rbf, sigmoid tanh, etc.

In [16]:
model=SVR(kernel='poly')
model.fit(X_train,y_train)
print('Training MSE : ',(mean_squared_error(y_train,model.predict(X_train)))**(0.5))
print('Testing MSE : ',(mean_squared_error(y_test,model.predict(X_test)))**(0.5))

Training MSE :  0.06618273727775908
Testing MSE :  0.06938689798026035




In [17]:
model=SVR(kernel='rbf')
model.fit(X_train,y_train)
print('Training MSE : ',(mean_squared_error(y_train,model.predict(X_train)))**(0.5))
print('Testing MSE : ',(mean_squared_error(y_test,model.predict(X_test)))**(0.5))

Training MSE :  0.065498284150713
Testing MSE :  0.07011774221772349




In [18]:
model=SVR(kernel='sigmoid')
model.fit(X_train,y_train)
print('Training MSE : ',(mean_squared_error(y_train,model.predict(X_train)))**(0.5))
print('Testing MSE : ',(mean_squared_error(y_test,model.predict(X_test)))**(0.5))

Training MSE :  0.06492050687735068
Testing MSE :  0.06884923542555596


