# Data Analysis (Regression): Ridge Regression
Ridge Regression is a modified version of Linear Regression in which an L2 penalty term is added to the loss function to minimize the complexity of the model.

## Import Libraries

In [1]:
# Import Required Modules and Packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import sys

from sklearn import metrics
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge

In [2]:
# Switch to the home directory so the relative 'Project/...' paths used later resolve.
# The explicit %cd magic is used because a bare `cd` relies on IPython automagic,
# which only applies to single-line cells and is shadowed by any variable named `cd`.
%cd ~

/root


## Load Data

In [3]:
# Load the X and y Data
# '..' is read as a missing value. The 'Unnamed: 0' column is dropped straight
# after reading (presumably the index written out by an earlier to_csv; verify
# against the cleaning notebook). Dropping via a chained, non-inplace call keeps
# each assignment self-contained.
X_train = pd.read_csv('Project/Cleaned/Regression/X_train.csv', na_values=['..']).drop(columns=['Unnamed: 0'])
X_test = pd.read_csv('Project/Cleaned/Regression/X_test.csv', na_values=['..']).drop(columns=['Unnamed: 0'])
y_train = pd.read_csv('Project/Cleaned/Regression/y_train.csv', na_values=['..']).drop(columns=['Unnamed: 0'])
y_test = pd.read_csv('Project/Cleaned/Regression/y_test.csv', na_values=['..']).drop(columns=['Unnamed: 0'])

# Load Dictionary
# The .npy file stores a pickled Python object, so allow_pickle must be the boolean
# True (the previous string 'TRUE' only worked because any non-empty string is truthy).
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
dictionary_series = np.load('Project/Cleaned/dictionary_series.npy', allow_pickle=True).item()

In [4]:
# Report the (rows, columns) of every split, test splits first
for label, table in (('X_test:', X_test), ('y_test:', y_test),
                     ('X_train:', X_train), ('y_train:', y_train)):
    print(label, table.shape)

X_test: (999, 20)
y_test: (999, 1)
X_train: (3993, 20)
y_train: (3993, 1)


## Ridge Regression

In [6]:
# Build the ridge estimator, then fit it on the training split.
# The single-column target frame is flattened to a 1-D array for fitting.
model_ridge = Ridge(alpha=0.01)
model_ridge.fit(X_train, y_train.to_numpy().ravel())

# Predictions for both splits
y_prediction_test = model_ridge.predict(X_test)
y_prediction_train = model_ridge.predict(X_train)

# Fitted coefficients
model_coefficients_ridge = model_ridge.coef_

## Evaluate Model

In [7]:
# Model Results
# Map each metric's display name to the function that computes it; every metric
# is then evaluated once on the train split and once on the test split.
def _root_mean_squared_error(y_true, y_pred):
    """Return the square root of the mean squared error."""
    return np.sqrt(metrics.mean_squared_error(y_true, y_pred))

metric_functions = {
    'R-Squared': r2_score,
    'Mean Absolute Error': metrics.mean_absolute_error,
    'Mean Squared Error': metrics.mean_squared_error,
    'Root Mean Squared Error': _root_mean_squared_error,
}

# One row per metric: [name, train value, test value]
model_results = [
    [metric_name, metric(y_train, y_prediction_train), metric(y_test, y_prediction_test)]
    for metric_name, metric in metric_functions.items()
]
model_results_ridge = pd.DataFrame(model_results, columns=['Evaluation Metric', 'Train', 'Test'])
model_results_ridge

Unnamed: 0,Evaluation Metric,Train,Test
0,R-Squared,0.291732,0.296716
1,Mean Absolute Error,1.100938,1.148211
2,Mean Squared Error,3.009843,3.373654
3,Root Mean Squared Error,1.73489,1.836751


In [8]:
# Coefficient table: one row per feature column of X_train, labelled by column name
model_coefficients_ridge_table = pd.Series(
    model_ridge.coef_.flatten(),
    index=X_train.columns,
    name='Coefficient',
).to_frame()
model_coefficients_ridge_table

Unnamed: 0,Coefficient
DT.NFL.UNDP.CD,0.050123
DT.NFL.UNFP.CD,0.33044
EG.CFT.ACCS.ZS,0.26505
EG.ELC.ACCS.RU.ZS,-0.011401
EG.ELC.ACCS.UR.ZS,0.052915
EG.FEC.RNEW.ZS,0.427167
IT.MLT.MAIN.P2,-0.275813
NV.AGR.TOTL.ZS,-0.078507
SE.PRM.ENRL.TC.ZS,0.041769
SE.PRM.NENR,-0.176073


In [9]:
# Look up the dictionary entry stored for one series code
series_code = 'DT.NFL.UNDP.CD'
dictionary_series[series_code]

{'Series Name': 'Net official flows from UN agencies, UNDP (current US$)'}