### Support Vector Machines (SVM)

Here we will be going throught the regression aspect of the SVM. SVM is a type of supervised machine learning model used for classification and regression tasks. The primary goal of SVM is to find a hyperplane that best separates the data into different classes.

In [2]:
import pandas as pd

earthquake = pd.read_csv('Earthquake_data_dataset.csv')
earthquake.head()

Unnamed: 0,title,magnitude,date_time,cdi,mmi,tsunami,sig,net,nst,dmin,gap,magType,depth,latitude,longitude,country
0,"M 7.0 - 18 km SW of Malango, Solomon Islands",7.0,22-11-2022 02:03,8,7,1,768,us,117,0.509,17.0,mww,14.0,-9.7963,159.596,Solomon Islands
1,"M 6.9 - 204 km SW of Bengkulu, Indonesia",6.9,18-11-2022 13:37,4,4,0,735,us,99,2.229,34.0,mww,25.0,-4.9559,100.738,
2,M 7.0 -,7.0,12-11-2022 07:09,3,3,1,755,us,147,3.125,18.0,mww,579.0,-20.0508,-178.346,Fiji
3,"M 7.3 - 205 km ESE of Neiafu, Tonga",7.3,11-11-2022 10:48,5,5,1,833,us,149,1.865,21.0,mww,37.0,-19.2918,-172.129,
4,M 6.6 -,6.6,09-11-2022 10:14,0,2,1,670,us,131,4.998,27.0,mww,624.464,-25.5948,178.278,


In [3]:
earthquake.shape

(782, 16)

In [4]:
earthquake.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   title      782 non-null    object 
 1   magnitude  782 non-null    float64
 2   date_time  782 non-null    object 
 3   cdi        782 non-null    int64  
 4   mmi        782 non-null    int64  
 5   tsunami    782 non-null    int64  
 6   sig        782 non-null    int64  
 7   net        782 non-null    object 
 8   nst        782 non-null    int64  
 9   dmin       782 non-null    float64
 10  gap        782 non-null    float64
 11  magType    782 non-null    object 
 12  depth      782 non-null    float64
 13  latitude   782 non-null    float64
 14  longitude  782 non-null    float64
 15  country    484 non-null    object 
dtypes: float64(6), int64(5), object(5)
memory usage: 97.9+ KB


In [5]:
earthquake.describe()

Unnamed: 0,magnitude,cdi,mmi,tsunami,sig,nst,dmin,gap,depth,latitude,longitude
count,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0
mean,6.941125,4.33376,5.964194,0.388747,870.108696,230.250639,1.325757,25.03899,75.883199,3.5381,52.609199
std,0.445514,3.169939,1.462724,0.487778,322.465367,250.188177,2.218805,24.225067,137.277078,27.303429,117.898886
min,6.5,0.0,1.0,0.0,650.0,0.0,0.0,0.0,2.7,-61.8484,-179.968
25%,6.6,0.0,5.0,0.0,691.0,0.0,0.0,14.625,14.0,-14.5956,-71.66805
50%,6.8,5.0,6.0,0.0,754.0,140.0,0.0,20.0,26.295,-2.5725,109.426
75%,7.1,7.0,7.0,1.0,909.75,445.0,1.863,30.0,49.75,24.6545,148.941
max,9.1,9.0,9.0,1.0,2910.0,934.0,17.654,239.0,670.81,71.6312,179.662


In [6]:
earthquake.isnull().sum()

title          0
magnitude      0
date_time      0
cdi            0
mmi            0
tsunami        0
sig            0
net            0
nst            0
dmin           0
gap            0
magType        0
depth          0
latitude       0
longitude      0
country      298
dtype: int64

In [7]:
earthquake.drop(['country', 'date_time', 'net', 'title'], axis = 1, inplace = True)

In [8]:
earthquake.head()

Unnamed: 0,magnitude,cdi,mmi,tsunami,sig,nst,dmin,gap,magType,depth,latitude,longitude
0,7.0,8,7,1,768,117,0.509,17.0,mww,14.0,-9.7963,159.596
1,6.9,4,4,0,735,99,2.229,34.0,mww,25.0,-4.9559,100.738
2,7.0,3,3,1,755,147,3.125,18.0,mww,579.0,-20.0508,-178.346
3,7.3,5,5,1,833,149,1.865,21.0,mww,37.0,-19.2918,-172.129
4,6.6,0,2,1,670,131,4.998,27.0,mww,624.464,-25.5948,178.278


In [9]:
earthquake = pd.get_dummies(earthquake, columns = ['magType'])

### Detecting and Removing Outliers

In [26]:
minimum_threshold = earthquake['magnitude'].quantile(0.05)

earthquake[earthquake['magnitude'] < minimum_threshold]

Unnamed: 0,magnitude,cdi,mmi,tsunami,sig,nst,dmin,gap,depth,latitude,longitude,magType_Mi,magType_mb,magType_md,magType_ml,magType_ms,magType_mw,magType_mwb,magType_mwc,magType_mww


In [19]:
maximum_threshold = earthquake['gap'].quantile(0.95)
earthquake[earthquake['gap'] > maximum_threshold]

Unnamed: 0,magnitude,cdi,mmi,tsunami,sig,nst,dmin,gap,depth,latitude,longitude,magType_Mi,magType_mb,magType_md,magType_ml,magType_ms,magType_mw,magType_mwb,magType_mwc,magType_mww
8,6.8,8,7,1,1179,175,2.137,92.0,20.0,18.33,-102.913,0,0,0,0,0,0,0,0,1
9,7.6,9,8,1,1799,271,1.153,69.0,26.943,18.3667,-103.252,0,0,0,0,0,0,0,0,1
21,6.6,6,5,1,762,0,0.914,94.0,27.0,11.5538,-86.9919,0,0,0,0,0,0,0,0,1
30,6.5,7,4,1,651,0,1.088,57.0,8.0,-29.535,-176.729,0,0,0,0,0,0,0,0,1
32,6.5,0,3,1,650,97,1.61607,108.0,37.0,52.502,-168.08,1,0,0,0,0,0,0,0,0
33,6.5,0,3,1,650,23,0.0,208.8,37.0,52.502,-168.08,1,0,0,0,0,0,0,0,0
35,6.7,0,4,1,691,50,0.936943,126.0,33.0,52.48,-167.736,1,0,0,0,0,0,0,0,0
36,6.7,0,4,1,691,20,0.0,205.2,33.0,52.48,-167.736,1,0,0,0,0,0,0,0,0
37,6.8,7,6,1,730,0,0.92,61.0,20.0,52.6563,-167.917,0,0,0,0,0,0,0,0,1
47,6.5,7,5,1,684,0,0.993,114.0,21.0,12.1598,-87.8542,0,0,0,0,0,0,0,0,1


In [20]:
earthquake_no_outliers = earthquake[(earthquake['gap'] > minimum_threshold) & (earthquake['gap'] < maximum_threshold)]
earthquake_no_outliers

Unnamed: 0,magnitude,cdi,mmi,tsunami,sig,nst,dmin,gap,depth,latitude,longitude,magType_Mi,magType_mb,magType_md,magType_ml,magType_ms,magType_mw,magType_mwb,magType_mwc,magType_mww
0,7.0,8,7,1,768,117,0.509,17.0,14.000,-9.7963,159.596,0,0,0,0,0,0,0,0,1
1,6.9,4,4,0,735,99,2.229,34.0,25.000,-4.9559,100.738,0,0,0,0,0,0,0,0,1
2,7.0,3,3,1,755,147,3.125,18.0,579.000,-20.0508,-178.346,0,0,0,0,0,0,0,0,1
3,7.3,5,5,1,833,149,1.865,21.0,37.000,-19.2918,-172.129,0,0,0,0,0,0,0,0,1
4,6.6,0,2,1,670,131,4.998,27.0,624.464,-25.5948,178.278,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
721,7.0,0,7,0,754,690,0.000,19.1,68.000,38.8490,141.568,0,0,0,0,0,0,1,0,0
722,6.8,0,9,0,711,534,0.000,19.4,12.000,36.9640,3.634,0,0,0,0,0,0,0,1,0
723,6.5,0,7,0,650,247,0.000,28.7,33.000,-8.2940,120.743,0,0,0,0,0,0,0,1,0
724,6.8,0,7,0,711,431,0.000,38.4,40.200,-4.6940,153.238,0,0,0,0,0,0,0,1,0


In [21]:
x = earthquake_no_outliers.iloc[:, 1:]
y = earthquake_no_outliers.iloc[:, 0]

In [22]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [23]:
from sklearn.svm import SVR

model = SVR()
model.fit(x_train, y_train)

In [24]:
y_pred = model.predict(x_test)

In [25]:
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'The mean squared error is {mse}')
print(f'The r squared value is {r2}')

The mean squared error is 0.07026717869669989
The r squared value is 0.5138673151321583


In [38]:
predictions = pd.DataFrame({'Actual Values': y_test, 'Predicted Values': y_pred})
predictions

Unnamed: 0,Actual Values,Predicted Values
596,6.5,6.708064
588,6.7,6.715540
208,6.5,6.601125
291,7.0,6.891346
174,6.9,6.890633
...,...,...
79,6.9,6.839076
148,6.8,6.792730
334,6.7,6.654785
204,6.8,7.163239


### Parameters Tuning

In this stage, we will be adjusting the parameters in order to improve the model metrics. Note that the smaller the mean squared error, the better the machine learning model.

The parameters we will be considering are: regualarization(C), gamma, and kernel

In [43]:
# regularization

model_C = SVR(C = 20)
model_C.fit(x_train, y_train)

In [44]:
y_predC = model_C.predict(x_test)

In [45]:
mse_C = mean_squared_error(y_test, y_predC)
print(f'The mean squared error is {mse_C}')

The mean squared error is 0.07124967232156353


In [49]:
# gamma

model_g = SVR(gamma = 5)
model_g.fit(x_train, y_train)



In [50]:
y_predG = model_g.predict(x_test)

In [51]:
mse_g = mean_squared_error(y_test, y_predG)
print(f'The mean squared error is {mse_g}')

The mean squared error is 0.15237464500500444


In [60]:
# kernel

model_kernel = SVR(kernel = 'linear')
model_kernel.fit(x_train, y_train)

In [58]:
y_pred_kernel = model_kernel.predict(x_test)

In [59]:
mse_kernel = mean_squared_error(y_test, y_pred_kernel)
print(f'The mean squared error is {mse_kernel}')

The mean squared error is 57.92229494138541


In [67]:
from sklearn.preprocessing import MinMaxScaler

sc = MinMaxScaler()

x_scaled = sc.fit_transform(x)

In [68]:
from sklearn.model_selection import train_test_split

x_train_scaled, x_test_scaled, y_train, y_test = train_test_split(x_scaled, y, test_size = 0.2, random_state = 42)

In [69]:
model_scaled = SVR()
model_scaled.fit(x_train_scaled, y_train)

In [70]:
y_pred_scaled = model_scaled.predict(x_test_scaled)

In [71]:
mse = mean_squared_error(y_test, y_pred_scaled)
mse

0.0788312805440139