In [1]:
# Import pandas and numpy
import pandas as pd 
import numpy as np

# Import for splitting data into train & test for the ML models
from sklearn.model_selection import train_test_split

# Import Machine Learning Linear regression models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score

# Import standard scaler for scaling the data
from sklearn.preprocessing import StandardScaler

# Import metrics to calculate accuracy of models
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the cleaned country data CSV file
df = pd.read_csv("../Resources/merged_df.csv", encoding ="unicode_escape")
df.head()

Unnamed: 0,country_name,Year,urbanization,gdp_per_cap,inflation,gdp_current,unemployment,literacy_adult,health_exp,literacy_youth,life_exp,re_consumption,ff_consumption,energy_use
0,Canada,1971,76.09,4520.162878,2.704918,99271960000.0,6.4,81.228489,6.204456,88.205112,73.029268,30.497158,84.154154,6436.226256
1,Afghanistan,1971,12.021,166.224831,25.565204,1831109000.0,8.118803,81.228489,6.204456,88.205112,37.923,30.497158,65.539705,2326.432009
2,Albania,1971,31.933,10212.316586,25.565204,206332200000.0,8.118803,81.228489,6.204456,88.205112,65.618,30.497158,74.660703,785.161526
3,Algeria,1971,39.665,359.824582,2.626642,5077222000.0,8.118803,81.228489,6.204456,88.205112,43.67,30.497158,98.999816,245.527602
4,American Samoa,1971,70.784,10212.316586,25.565204,206332200000.0,8.118803,81.228489,6.204456,88.205112,66.294149,30.497158,65.539705,2326.432009


In [3]:
# Filtering only Canada's data
Canada_df = df.loc[df.country_name == "Canada"]
Canada_df = Canada_df.reset_index(drop=True)
Canada_df

Unnamed: 0,country_name,Year,urbanization,gdp_per_cap,inflation,gdp_current,unemployment,literacy_adult,health_exp,literacy_youth,life_exp,re_consumption,ff_consumption,energy_use
0,Canada,1971,76.09,4520.162878,2.704918,99271960000.0,6.4,81.228489,6.204456,88.205112,73.029268,30.497158,84.154154,6436.226256
1,Canada,1972,75.971,5089.587902,4.988029,113082800000.0,6.3,81.228489,6.204456,88.205112,72.933902,30.497158,84.024318,6863.444335
2,Canada,1973,75.851,5838.660894,7.487647,131321900000.0,5.6,81.228489,6.204456,88.205112,73.162683,30.497158,82.793349,7084.623683
3,Canada,1974,75.731,7033.011021,10.997171,160408700000.0,5.4,81.228489,6.204456,88.205112,73.237561,30.497158,82.205504,7089.999991
4,Canada,1975,75.611,7511.211343,10.672189,173834000000.0,8.118803,81.228489,6.204456,88.205112,73.521707,30.497158,84.054438,7170.412528
5,Canada,1976,75.503,8809.26466,7.541739,206575600000.0,7.09,81.228489,6.204456,88.205112,73.856098,30.497158,83.304313,7293.698609
6,Canada,1977,75.543,8919.057461,7.976445,211612200000.0,8.05,81.228489,6.204456,88.205112,74.21561,30.497158,82.515796,7431.439886
7,Canada,1978,75.583,9123.691334,8.973723,218632900000.0,8.38,81.228489,6.204456,88.205112,74.529756,30.497158,81.227285,7561.744939
8,Canada,1979,75.623,10043.660959,9.144677,243072100000.0,7.53,81.228489,6.204456,88.205112,74.866341,30.497158,81.429416,7862.206395
9,Canada,1980,75.663,11170.563972,10.129221,273853800000.0,7.54,81.228489,6.204456,88.205112,75.078049,30.497158,80.563983,7828.837983


In [4]:
Canada_df.shape

(51, 14)

### Random Forest Regressor model on Canada's data

In [5]:
# Setting features dataframe and target vector
X = Canada_df.drop(["gdp_current", "country_name", "Year", "gdp_per_cap"], axis=1)
y = Canada_df["gdp_current"]

# Split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create the random forest regressor model
rfr_model = RandomForestRegressor(n_estimators=1000, random_state=42)

# Fit the model
rfr_model.fit(X_train, y_train)

RandomForestRegressor(n_estimators=1000, random_state=42)

In [6]:
# Determine feature importances
importances = rfr_model.feature_importances_

# Sort the feature importances in descending order (highest to lowest)
sorted_index = importances.argsort()[::-1]

# Print the feature names and their importances
print("Feature Importances:")
for index in sorted_index:
    print("{}: {}".format(X.columns[index], importances[index]))

Feature Importances:
urbanization: 0.7180729073425828
life_exp: 0.1678048652751081
health_exp: 0.07664858637588726
ff_consumption: 0.010840146175135675
re_consumption: 0.009041867631125838
energy_use: 0.007644313571068899
inflation: 0.005313926023885311
unemployment: 0.004633387605206207
literacy_youth: 0.0
literacy_adult: 0.0


Since literacy_youth and literacy_adult has no importance to the model, I will drop these features.

Restarting the whole process with the above two columns dropped:

In [7]:
# Setting features dataframe and target vector
X = Canada_df.drop(["gdp_current", "country_name", "Year", "gdp_per_cap", "literacy_youth", "literacy_adult"], axis=1)
y = Canada_df["gdp_current"]

# Split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create the random forest regressor model
rfr_model = RandomForestRegressor(n_estimators=1000, random_state=42)

# Fit the model
rfr_model.fit(X_train, y_train)

RandomForestRegressor(n_estimators=1000, random_state=42)

In [8]:
# Re-determining feature importances
importances = rfr_model.feature_importances_

# Sort the feature importances in descending order (highest to lowest)
sorted_index = importances.argsort()[::-1]

# Print the feature names and their importances
print("Feature Importances:")
for index in sorted_index:
    print("{}: {}".format(X.columns[index], importances[index]))

Feature Importances:
urbanization: 0.6973664441709672
life_exp: 0.1791196048253581
health_exp: 0.07448775032869126
energy_use: 0.015590373643012865
ff_consumption: 0.012541197493873023
re_consumption: 0.009767024048182428
unemployment: 0.007061889797915726
inflation: 0.004065715691999429


Good.

In [11]:
# Make predictions on the test data set
y_pred = rfr_model.predict(X_test)

print("Evaluation metrics of Canadian Data (1971-2021)")
print("====================")
# Calculate the accuracy of of RFR model
score = rfr_model.score(X_test, y_test)
print(f"Test accuracy: {np.round(score * 100, 2)}%")

# Calculate the r2_score of RFR model
r2_score_ = r2_score(y_test,y_pred)
print(f"R2 Score: {np.round(r2_score_ * 100, 2)}%")

Evaluation metrics of Canadian Data (1971-2021)
Test accuracy: 96.83%
R2 Score: 96.83%


### Random Forest Regressor model and Scaling on Canada's data

In [11]:
# Calculating the MSE of RFR model
mse = mean_squared_error(y_test, y_pred)
print("--------------------")
print("Mean Squared Error:", mse)

# Setting features dataframe and target vector
X = Canada_df.drop(["gdp_current", "country_name", "Year", "gdp_per_cap", "literacy_youth", "literacy_adult"], axis=1)
y = Canada_df["gdp_current"]

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)

# Create the random forest regressor model
rfr_model = RandomForestRegressor(n_estimators=1000, random_state=42)

# Fit the model
rfr_model.fit(X_train, y_train)

RandomForestRegressor(n_estimators=1000, random_state=42)

In [12]:
# Re-determining feature importances
importances = rfr_model.feature_importances_

# Sort the feature importances in descending order (highest to lowest)
sorted_index = importances.argsort()[::-1]

# Print the feature names and their importances
print("Feature Importances:")
for index in sorted_index:
    print("{}: {}".format(X.columns[index], importances[index]))

Feature Importances:
life_exp: 0.43079678116155357
urbanization: 0.3963384053624043
health_exp: 0.14344892663802344
ff_consumption: 0.009086405868626014
energy_use: 0.007677717441347762
re_consumption: 0.0060746822556745714
inflation: 0.0033539797689156226
unemployment: 0.003223101503454743


In [13]:
# Make predictions on the scaled test data set
y_pred = rfr_model.predict(X_test)

# Calculate the accuracy of the RandomForestRegressor model
score = rfr_model.score(X_test, y_test)
print(f"Test accuracy: {np.round(score * 100, 2)}%")

# Calculate the r2_score
r2_score_ = r2_score(y_test,y_pred)
print(f"R2 Score: {np.round(r2_score_ * 100, 2)}%")

Test accuracy: 70.55%
R2 Score: 70.55%


From the above results, I came to the conclusion that scaling the data drops the accuracy of regression models. 