# Linear Regression 

In [1]:
import warnings
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pylab

import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): this suppresses every warning raised from here on, not just a
# specific noisy one -- consider narrowing the filter to the category/module intended.
warnings.filterwarnings('ignore')
# Load the raw dataset; the path is relative to the kernel's working directory.
df = pd.read_csv("Financial_Mexican_Firms.csv")

- Train & Test Split

In [2]:
# Positional split of the frame: the first ten columns are used as predictors,
# the column at position 10 is used as the response.
X = df.iloc[:, :10]
y = df.iloc[:, 10]

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Recombine the training predictors and response into a single frame.
df_train = pd.concat([X_train, y_train], axis="columns")

- Handling missing values

In [3]:
import miceforest as mf

# Number of imputed datasets; shared by the kernel and the averaging step below
# so the two cannot drift apart.
n_datasets = 5

# Fit the imputation kernel on the training frame only.
kds = mf.ImputationKernel(df_train,
                          datasets = n_datasets,
                          save_all_iterations = True,
                          random_state = 0)
# Run MICE for 6 iterations
kds.mice(6)

# Update values: stack every completed dataset, then average the imputations
# row by row (rows are matched through their original index label).
df_train = pd.concat(
    [kds.complete_data(i) for i in range(n_datasets)]
).groupby(level = 0).mean()
X_train  = df_train.iloc[:, 0:10]
y_train  = df_train.iloc[:, 10]

- Handling outliers (Z-score)

In [12]:
def outlier_detector(df, cutoff=2.5):
    """Return the values lying more than ``cutoff`` standard deviations from the mean.

    The z-score of every value is computed against the mean and the population
    standard deviation (``np.std`` default, ddof=0) of *all* values in ``df``.
    This is a normality-based rule of thumb.

    Parameters
    ----------
    df : pandas.Series, pandas.DataFrame or array-like
        Numeric data to screen. Multi-dimensional input is flattened, so the
        mean/std are global rather than per column.
    cutoff : float, default 2.5
        Absolute z-score above which a value is reported as an outlier.

    Returns
    -------
    pandas.DataFrame
        One row per outlier, in input order, in a single column labelled ``0``.
        An empty DataFrame is returned when nothing is flagged, when the input
        is empty, or when the standard deviation is zero or NaN.
    """
    values = np.asarray(df).ravel()
    if values.size == 0:
        return pd.DataFrame([])

    mean = np.mean(values)
    std = np.std(values)
    # A zero (constant data) or NaN (missing values present) spread makes the
    # z-score undefined; report no outliers instead of dividing by it.
    if not std > 0:
        return pd.DataFrame([])

    is_outlier = np.abs((values - mean) / std) > cutoff
    # Go through a plain list so the result has the same layout as before
    # (column 0, fresh RangeIndex; a completely empty frame when nothing is flagged).
    return pd.DataFrame(values[is_outlier].tolist())

In [13]:
# Report, for each column of the training frame, how many values the z-score rule flags.
for feature in df_train.columns:
    n_flagged = len(outlier_detector(df_train[feature]))
    print('Outliers: {} '.format(n_flagged), feature)

Outliers: 1  ProposedIndex
Outliers: 2  IIHH
Outliers: 1  Shannon
Outliers: 1  Size
Outliers: 5  AssetTurnover
Outliers: 4  Debt
Outliers: 5  QuickRatio
Outliers: 5  CashHoldings
Outliers: 6  ROE
Outliers: 5  ROI
Outliers: 5  ROA


- Feature Engineering
    - Feature Scaling 

In this particular case, every scaling method tried hurt performance (R-squared), so the features are left unscaled.

- Specify the model

In [6]:
# sm.add_constant is not applied to the design matrix, so the model has no
# intercept term and is fitted on the raw predictor columns only.
ols_spec      = sm.OLS(y_train.to_numpy(), X_train.to_numpy())
optimal_model = ols_spec.fit()
print(optimal_model.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.838
Model:                            OLS   Adj. R-squared (uncentered):              0.827
Method:                 Least Squares   F-statistic:                              77.17
Date:                Thu, 29 Sep 2022   Prob (F-statistic):                    8.68e-54
Time:                        11:47:39   Log-Likelihood:                          322.80
No. Observations:                 159   AIC:                                     -625.6
Df Residuals:                     149   BIC:                                     -594.9
Df Model:                          10                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

- Evaluate performance

In [7]:
import time

# perf_counter is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Predict on the held-out predictors with the fitted model.
y_pred     = optimal_model.predict(X_test)

print('R2  :', r2_score(y_test, y_pred))
print('MAE :', mean_absolute_error(y_test, y_pred))
# Compute the MSE once and derive the RMSE from it.
mse = mean_squared_error(y_test, y_pred)
print('MSE :', mse)
print('RMSE:', np.sqrt(mse))
# The reported time covers prediction, metric computation and printing.
print('Processing time: %s seconds' % round((time.perf_counter() - start_time), 4))

R2  : 0.5903052448446527
MAE : 0.02543887452824191
MSE : 0.0017683785324535814
RMSE: 0.04205209308053026
Processing time: 0.003 seconds


- Deployment

In [9]:
import pickle

# Serialize the fitted model to disk. The context manager guarantees the file
# is flushed and closed, even if pickling raises part-way through.
with open("linear_regression.pkl", 'wb') as file:
    pickle.dump(optimal_model, file)