In [1]:
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
#Models to compare
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
warnings.filterwarnings('ignore')

df = pd.read_csv("Financial_Mexican_Firms.csv")

Train & Test Split

In [2]:
X  = df.iloc[:, 0:10]
y  = df.iloc[:, 10]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 0) 
df_train = pd.concat([X_train, y_train], axis = 1)
df_test  = pd.concat([X_test, y_test], axis = 1)

Handling missing values

In [3]:
import miceforest as mf

df_train_mice = mf.ImputationKernel(df_train, 
                           datasets = 5, 
                           save_all_iterations = True, 
                           random_state = 0)
df_train_mice.mice(6)
df_train_mice.complete_data(4)

df_train = pd.concat([df_train_mice.complete_data(i) for i in range(5)]).groupby(level = 0).mean()
X_train  = df_train.iloc[:, 0:10]
y_train  = df_train.iloc[:, 10]

Feature Scaling

In [4]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

std_scaler = StandardScaler()
mmx_scaler = MinMaxScaler()
rob_scaler = RobustScaler()

X_train_scaled = pd.DataFrame(rob_scaler.fit_transform(X_train), columns = ['ProposedIndex', 
                                                                 'IIHH',  
                                                                 'Shannon',
                                                                 'Size',
                                                                 'AssetTurnover',
                                                                 'Debt',
                                                                 'QuickRatio',
                                                                 'CashHoldings',
                                                                 'ROE',
                                                                 'ROI'])

X_test_scaled = pd.DataFrame(rob_scaler.transform(X_test), columns = ['ProposedIndex', 
                                                                 'IIHH',  
                                                                 'Shannon',
                                                                 'Size',
                                                                 'AssetTurnover',
                                                                 'Debt',
                                                                 'QuickRatio',
                                                                 'CashHoldings',
                                                                 'ROE',
                                                                 'ROI'])
# Update scaled datasets
df_train_scaled = pd.concat([X_train_scaled, y_train], axis = 1)     
df_test_scaled  = pd.concat([X_test_scaled, y_test], axis = 1) 

Robust Scaling is applied 

Model Comparison
- No Feature Scaling
- Feature Scaling

In [5]:
models = []
models.append(('LinearRegression', LinearRegression()))
models.append(('KNN', KNeighborsRegressor(n_neighbors = 4,
                                         algorithm    = 'brute',
                                         leaf_size    = 1,
                                         metric       = 'minkowski',
                                         p            = 2,
                                         weights      = 'distance')))
models.append(('SVM', SVR(kernel     = 'poly', 
                          degree     = 4, 
                          C          = 5, 
                          epsilon    = 0.1)))
models.append(('Random Forest', RandomForestRegressor(n_estimators      = 63,
                                                      min_samples_split = 16,
                                                      min_samples_leaf  = 1,
                                                      max_features      = 'auto',
                                                      max_depth         = 30)))
models.append(('XGBoost',XGBRegressor(colsample_bytree  = 0.7,
                                      learning_rate     = 0.2,
                                      max_depth         = 7, 
                                      min_child_weight  = 1,
                                      reg_lambda        = 1)))
print('Values without Scaled Features ')
import time
start_time = time.time()

for name, model in models:
    fitting = model.fit(X_train, y_train)
    y_pred  = fitting.predict(X_test)
    rsquare = r2_score(y_test, y_pred)
    mse     = mean_squared_error(y_test, y_pred)
    
    print('R2: ',round(rsquare, 5), name,"--- %s seconds ---" % (time.time() - start_time) )

Values without Scaled Features 
R2:  0.58969 LinearRegression --- 0.004999637603759766 seconds ---
R2:  -0.13649 KNN --- 0.0859987735748291 seconds ---
R2:  0.33099 SVM --- 0.08999991416931152 seconds ---
R2:  0.77304 Random Forest --- 0.182999849319458 seconds ---
R2:  0.8439 XGBoost --- 0.24100017547607422 seconds ---


In [6]:
print('Values with Scaled Features ')
import time
start_time = time.time()

for name, model in models:
    fitting = model.fit(X_train_scaled, y_train)
    y_pred  = fitting.predict(X_test_scaled)
    rsquare = r2_score(y_test, y_pred)
    mse     = mean_squared_error(y_test, y_pred)
    
    print('R2: ',round(rsquare, 5), name,"--- %s seconds ---" % (time.time() - start_time) )

Values with Scaled Features 
R2:  0.58969 LinearRegression --- 0.004236698150634766 seconds ---
R2:  0.68214 KNN --- 0.010002613067626953 seconds ---
R2:  -0.19865 SVM --- 0.0149993896484375 seconds ---
R2:  0.77154 Random Forest --- 0.10699892044067383 seconds ---
R2:  0.84388 XGBoost --- 0.15799927711486816 seconds ---
