# Support Vector Machines

In [1]:
import warnings
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, KFold, train_test_split

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
warnings.filterwarnings('ignore')
df = pd.read_csv("Financial_Mexican_Firms.csv")

- Train & Test Split

In [2]:
X  = df.iloc[:, 0:10]
y  = df.iloc[:, 10]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 0) 
df_train = pd.concat([X_train, y_train], axis = 1)
df_test  = pd.concat([X_test, y_test], axis = 1)

- Handling missing values

In [3]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression

# Imputation techniques
iregression_imputer = IterativeImputer(estimator = LinearRegression(), 
                                   missing_values = np.nan, 
                                   max_iter = 5, 
                                   imputation_order = 'roman', 
                                   random_state = 0)

# Apply an Imputation technique for training & testing datasets and rename columns
df_train  = pd.DataFrame(iregression_imputer.fit_transform(df_train), columns = ['ProposedIndex', 
                                                                 'IIHH',  
                                                                 'Shannon',
                                                                 'Size',
                                                                 'AssetTurnover',
                                                                 'Debt',
                                                                 'QuickRatio',
                                                                 'CashHoldings',
                                                                 'ROE',
                                                                 'ROI',
                                                                 'ROA'])

df_test  = pd.DataFrame(iregression_imputer.fit_transform(df_test), columns = ['ProposedIndex', 
                                                                 'IIHH',  
                                                                 'Shannon',
                                                                 'Size',
                                                                 'AssetTurnover',
                                                                 'Debt',
                                                                 'QuickRatio',
                                                                 'CashHoldings',
                                                                 'ROE',
                                                                 'ROI', 
                                                                 'ROA'])

# Update the training set
X_train  = df_train.iloc[:,:-1]
y_train  = df_train.ROA

X_test  = df_test.iloc[:,:-1]
y_test  = df_test.ROA

- Hyperparameter Tunning

In [4]:
# Specify different values for the tunning process
kfold  = KFold(n_splits = 5, random_state = None, shuffle = False)

kernel  = ['rbf', 'poly', 'sigmoid']
degree  = [1, 2, 3, 4]
gamma   = ['scale', 'auto']
C       = [0.01, 0.1, 1, 5]
epsilon = [0.1, 0.25, 0.75]

#Create parameter grid
myparamgrid = [{'kernel' : kernel,
                'degree' : degree, 
                'C'      : C,
                'epsilon': epsilon}]

#Create SVR object
model  = SVR()

#Grid Search CV
svmsearch   = GridSearchCV(model, 
                           myparamgrid, 
                           scoring= 'r2', 
                           cv = kfold, 
                           verbose= False)

- Fit the model and measure time to execute

In [5]:
from datetime import datetime

def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))
        
start_time = timer(None) # timing starts from this point for "start_time" variable
svmsearch.fit(X_train,y_train)
timer(start_time) # timing ends here for "start_time" variable


 Time taken: 0 hours 0 minutes and 1.51 seconds.


- Get the best tunning parameters 

In [6]:
svmsearch.best_estimator_
svmsearch.best_params_

{'C': 5, 'degree': 4, 'epsilon': 0.1, 'kernel': 'poly'}

- Specify the optimal model

In [7]:
optimal_model = SVR(kernel     = 'poly', 
                    degree     = 4, 
                    C          = 5, 
                    epsilon    = 0.1, 
                    cache_size = 5000).fit(X_train, y_train)                          

- Evaluate performance

In [8]:
import time
start_time = time.time()

y_pred     = optimal_model.predict(X_test)
    
print('R2  :', r2_score(y_test, y_pred))
print('MAE :', mean_absolute_error(y_test, y_pred))
print('MSE :', mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('Processing time: %s seconds' % round((time.time() - start_time), 4))

R2  : 0.3667561776278284
MAE : 0.03552341662914489
MSE : 0.0027332904978663642
RMSE: 0.05228088080614522
Processing time: 0.004 seconds


- Deployment

In [9]:
import pickle
file = open("SVR_Diversification.pkl", 'wb')
pickle.dump(optimal_model, file)