<a href="https://colab.research.google.com/github/Wavelydavely/World_Happiness_Report/blob/test_lili/backwards_elimination.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder, LabelEncoder

from sklearn.linear_model import LinearRegression

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt

#  Import and read the csv.
import pandas as pd 
happiness_df = pd.read_csv("World_Happiness_Report_2021.csv")
happiness_df.head(10)

Unnamed: 0,Country,Year,World_Region,Happiness_Score,Economic_Production,Social_Support,Life_Expectancy,Freedom,Generosity,Perceptions_of_Corruption
0,Finland,2021,Western Europe,7.842,10.775,0.954,72.0,0.949,-0.098,0.186
1,Denmark,2021,Western Europe,7.62,10.933,0.954,72.7,0.946,0.03,0.179
2,Switzerland,2021,Western Europe,7.571,11.117,0.942,74.4,0.919,0.025,0.292
3,Iceland,2021,Western Europe,7.554,10.878,0.983,73.0,0.955,0.16,0.673
4,Netherlands,2021,Western Europe,7.464,10.932,0.942,72.4,0.913,0.175,0.338
5,Norway,2021,Western Europe,7.392,11.053,0.954,73.3,0.96,0.093,0.27
6,Sweden,2021,Western Europe,7.363,10.867,0.934,72.7,0.945,0.086,0.237
7,Luxembourg,2021,Western Europe,7.324,11.647,0.908,72.6,0.907,-0.034,0.386
8,New Zealand,2021,North America and ANZ,7.277,10.643,0.948,73.4,0.929,0.134,0.242
9,Austria,2021,Western Europe,7.268,10.906,0.934,73.3,0.908,0.042,0.481


In [4]:
# Split our preprocessed data into our features and target arrays
y = happiness_df["Happiness_Score"] # y = dependent = ladder score
X = happiness_df[["Economic_Production", "Social_Support", "Life_Expectancy", "Freedom", "Generosity", "Perceptions_of_Corruption"]] # X = independent = any other variable

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, test_size=0.3)

In [5]:
# preprocess strings. one hot in coder
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [6]:
# Introduce regressor
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [7]:
# Print the intercept and coefficients
print("Intercept: ", regressor.intercept_)
print("Coefficients:")
list(zip(X, regressor.coef_))

Intercept:  -3.2863681865789367
Coefficients:


[('Economic_Production', 0.26698479015522897),
 ('Social_Support', 2.836910157932162),
 ('Life_Expectancy', 0.0357122158599017),
 ('Freedom', 2.322455127217586),
 ('Generosity', 0.6134483778329275),
 ('Perceptions_of_Corruption', -0.2294828829935673)]

In [8]:
# Create vector of predictions
y_pred = regressor.predict(X_test)

In [9]:
print(f"Prediction: {y_pred}")

Prediction: [5.22293702 3.82553433 4.61790206 5.8712758  5.0631551  5.29472032
 4.69029587 6.10254018 5.80605036 5.38465595 4.77353665 5.97214325
 5.79510235 6.35534589 6.2813558  5.69202394 5.71187419 6.12388105
 6.32821389 6.33520764 5.91024469 4.16565021 4.26227754 5.80137922
 5.62234855 6.30062846 6.00497903 6.98885412 3.35167221 4.99471359
 3.08203224 4.50764543 3.79946245 5.43773302 6.52651079 7.04043312
 4.43295438 7.09372342 5.66938269 6.56358101 5.54715199 5.31138185
 3.83171724 5.97895884 5.51696496]


In [10]:
# Show actual value with predicted value
regressor_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred})
real_values = regressor_diff.sort_values('Actual value', ascending=False)
real_values

Unnamed: 0,Actual value,Predicted value
2,7.571,7.093723
4,7.464,7.040433
7,7.324,6.988854
15,7.069,6.335208
19,6.834,6.328214
21,6.647,6.526511
30,6.431,6.355346
33,6.331,6.10254
34,6.33,5.801379
39,6.189,6.563581


In [11]:
#Model Evaluation
from sklearn import metrics
MAE = metrics.mean_absolute_error(y_test, y_pred)
MSE = metrics.mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print('R squared: {:.2f}'.format(regressor.score(X, y)))
print('Mean Absolute Error:', MAE)
print('Mean Square Error:', MSE)
print('Root Mean Square Error:', RMSE)

R squared: 0.75
Mean Absolute Error: 0.4299545696800916
Mean Square Error: 0.30512012767410596
Root Mean Square Error: 0.5523767986384891


Backward Elimination 

In [12]:
import statsmodels.api as smf  

  import pandas.util.testing as tm


In [13]:
 # Add a column of ones in our matrix of features
X = np.append(arr = np.ones((149,1)).astype(int), axis=1, values=X)

In [14]:
# Create matrix of optimal features (X_optimal) these features are statistically significant(have high impact on ladder score)
X_optimal = X[:, [0, 1, 2, 3, 4, 5]]
regressor_opt = smf.OLS(endog = y, exog = X_optimal).fit()


In [15]:
# Summarize 
regressor_opt.summary()

0,1,2,3
Dep. Variable:,Happiness_Score,R-squared:,0.748
Model:,OLS,Adj. R-squared:,0.74
Method:,Least Squares,F-statistic:,85.07
Date:,"Sat, 14 Aug 2021",Prob (F-statistic):,4.32e-41
Time:,19:07:27,Log-Likelihood:,-118.75
No. Observations:,149,AIC:,249.5
Df Residuals:,143,BIC:,267.5
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-3.1559,0.456,-6.924,0.000,-4.057,-2.255
x1,0.3093,0.087,3.569,0.000,0.138,0.481
x2,2.1679,0.659,3.289,0.001,0.865,3.471
x3,0.0339,0.013,2.534,0.012,0.007,0.060
x4,2.2870,0.482,4.743,0.000,1.334,3.240
x5,0.4916,0.319,1.541,0.125,-0.139,1.122

0,1,2,3
Omnibus:,9.397,Durbin-Watson:,1.676
Prob(Omnibus):,0.009,Jarque-Bera (JB):,9.328
Skew:,-0.561,Prob(JB):,0.00943
Kurtosis:,3.495,Cond. No.,1000.0


In [16]:
# if p value > 0.05 then remove said index by removing index from X_optimal 
X_optimal = X[:, [0, 1, 2, 3, 4]]
regressor_opt = smf.OLS(endog = y, exog = X_optimal).fit()


In [17]:
# Summarize 
regressor_opt.summary()

0,1,2,3
Dep. Variable:,Happiness_Score,R-squared:,0.744
Model:,OLS,Adj. R-squared:,0.737
Method:,Least Squares,F-statistic:,104.7
Date:,"Sat, 14 Aug 2021",Prob (F-statistic):,1.27e-41
Time:,19:08:21,Log-Likelihood:,-119.97
No. Observations:,149,AIC:,249.9
Df Residuals:,144,BIC:,265.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-3.1116,0.457,-6.808,0.000,-4.015,-2.208
x1,0.2918,0.086,3.381,0.001,0.121,0.462
x2,2.1659,0.662,3.270,0.001,0.857,3.475
x3,0.0331,0.013,2.463,0.015,0.007,0.060
x4,2.4982,0.465,5.378,0.000,1.580,3.416

0,1,2,3
Omnibus:,9.504,Durbin-Watson:,1.647
Prob(Omnibus):,0.009,Jarque-Bera (JB):,9.477
Skew:,-0.557,Prob(JB):,0.00875
Kurtosis:,3.535,Cond. No.,998.0
