# CS 486: Machine Learning - Data Challenge 1 - Omar Hussain

## Pre-Processing Data
### Importing Packages

In [283]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pandas.plotting import scatter_matrix

### Reading CSV Files

In [284]:
read_train_x = pd.read_csv("train_x_region.csv")
read_train_y = pd.read_csv("train_y.csv")
read_test_x = pd.read_csv("test_x_region.csv")

read_train_x.describe()

Unnamed: 0,Year,Happiness Rank,Happiness Score,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Population Estimate
count,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0,242.0
mean,2016.954545,58.03719,5.895496,2.884285,1.172189,0.882891,0.538545,0.258015,0.188854,87744800.0
std,1.435328,41.147073,1.051886,2.456517,0.307805,0.279914,0.177976,0.188388,0.117228,258478500.0
min,2015.0,1.0,3.006,0.0712,0.10706,0.267,0.013,0.001,0.0,301101.0
25%,2016.0,23.0,5.18375,0.98275,0.975545,0.707388,0.449185,0.083025,0.099,7347570.0
50%,2017.0,51.0,5.9465,1.32746,1.234435,0.87432,0.546675,0.211095,0.16246,13690830.0
75%,2018.0,88.75,6.74375,5.377166,1.43075,1.021648,0.642162,0.427625,0.2661,73183320.0
max,2019.0,156.0,7.769,7.59,1.616463,1.558231,0.95277,0.635423,0.51912,1567995000.0


### Concatenating Datasets

In [285]:
train_x_and_y = pd.concat([read_train_x, read_train_y], axis = 1)
train_x_and_y

Unnamed: 0,Year,Country,Happiness Rank,Happiness Score,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Population Estimate,Region,Immigration to the United States
0,2016,France,32,6.478,6.559000,1.394880,1.005080,0.837950,0.465620,0.178080,74264176,Europe & Central Asia,5473.0
1,2018,Venezuela,102,4.806,0.996000,1.469000,0.657000,0.133000,0.052000,0.056000,29174393,Latin America & Caribbean,11481.0
2,2019,Costa Rica,12,7.167,1.034000,1.441000,0.963000,0.558000,0.093000,0.144000,4657652,Latin America & Caribbean,2466.0
3,2019,Bolivia,61,5.779,0.776000,1.209000,0.706000,0.511000,0.064000,0.137000,10286729,Latin America & Caribbean,1425.0
4,2016,Nicaragua,48,5.992,6.107000,0.693840,0.895210,0.652130,0.465820,0.162920,5994622,Latin America & Caribbean,3397.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,2017,Philippines,72,5.430,5.314665,0.857699,1.253918,0.468009,0.585215,0.193513,99766788,East Asia & Pacific,46542.0
238,2018,United Kingdom,11,7.190,1.244000,1.433000,0.888000,0.464000,0.082000,0.262000,73917008,Europe & Central Asia,11867.0
239,2017,Ethiopia,119,4.460,4.377271,0.339234,0.864669,0.353410,0.408843,0.312651,78815294,Sub-Saharan Africa,15678.0
240,2019,India,140,4.015,0.755000,0.765000,0.588000,0.498000,0.085000,0.200000,1260422331,South Asia,51139.0


### Verifying Dataset Integrity

In [286]:
train_x_and_y.isna().sum()

Year                                0
Country                             0
Happiness Rank                      0
Happiness Score                     0
Economy (GDP per Capita)            0
Family                              0
Health (Life Expectancy)            0
Freedom                             0
Trust (Government Corruption)       0
Generosity                          0
Population Estimate                 0
Region                              0
Immigration to the United States    0
dtype: int64

### Displaying All Dataset Features

In [287]:
train_x_and_y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242 entries, 0 to 241
Data columns (total 13 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Year                              242 non-null    int64  
 1   Country                           242 non-null    object 
 2   Happiness Rank                    242 non-null    int64  
 3   Happiness Score                   242 non-null    float64
 4   Economy (GDP per Capita)          242 non-null    float64
 5   Family                            242 non-null    float64
 6   Health (Life Expectancy)          242 non-null    float64
 7   Freedom                           242 non-null    float64
 8   Trust (Government Corruption)     242 non-null    float64
 9   Generosity                        242 non-null    float64
 10  Population Estimate               242 non-null    int64  
 11  Region                            242 non-null    object 
 12  Immigrat

## Analysing Correlation Between Features and Target
### Normalization immigration data for standard deviation 

In [288]:
train_x_and_y["normalized_immigration"] = train_x_and_y["Immigration to the United States"] / train_x_and_y["Population Estimate"]

correlation_value = train_x_and_y.corr()
correlation_value["normalized_immigration"].sort_values()

Happiness Score                    -0.162831
Family                             -0.143889
Population Estimate                -0.133059
Health (Life Expectancy)           -0.085975
Freedom                            -0.075189
Generosity                         -0.073544
Trust (Government Corruption)      -0.042822
Economy (GDP per Capita)           -0.013433
Year                                0.010020
Happiness Rank                      0.148433
Immigration to the United States    0.279210
normalized_immigration              1.000000
Name: normalized_immigration, dtype: float64

## Choosing Features for Model
### Eliminating all non-numerical values

In [289]:
train_x_and_y.drop(["Year", "Country", "Region", "Immigration to the United States"], axis = 1, inplace = True)
train_x_and_y

Unnamed: 0,Happiness Rank,Happiness Score,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Population Estimate,normalized_immigration
0,32,6.478,6.559000,1.394880,1.005080,0.837950,0.465620,0.178080,74264176,0.000074
1,102,4.806,0.996000,1.469000,0.657000,0.133000,0.052000,0.056000,29174393,0.000394
2,12,7.167,1.034000,1.441000,0.963000,0.558000,0.093000,0.144000,4657652,0.000529
3,61,5.779,0.776000,1.209000,0.706000,0.511000,0.064000,0.137000,10286729,0.000139
4,48,5.992,6.107000,0.693840,0.895210,0.652130,0.465820,0.162920,5994622,0.000567
...,...,...,...,...,...,...,...,...,...,...
237,72,5.430,5.314665,0.857699,1.253918,0.468009,0.585215,0.193513,99766788,0.000467
238,11,7.190,1.244000,1.433000,0.888000,0.464000,0.082000,0.262000,73917008,0.000161
239,119,4.460,4.377271,0.339234,0.864669,0.353410,0.408843,0.312651,78815294,0.000199
240,140,4.015,0.755000,0.765000,0.588000,0.498000,0.085000,0.200000,1260422331,0.000041


## Choosing the Best Metric for the Model
### Comparing Ridge and Lasso Regression performance

In [290]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.metrics import mean_squared_error

## Testing Ridge Regression
### The alpha value is simply the regression parameter. As you increase the regression parameter, you subsequently increase the penalty. Thus, a lower regression parameter so the result of the regression does not overfit to the training data.
### Testing solver values led to 'svd' yielding the greatest performance.

In [291]:
ridge_test = Ridge(alpha = 0.01, solver = 'svd')
ridge_test.fit(train_x_and_y, read_train_y)

Ridge(alpha=0.01, solver='svd')

In [292]:
hypothesis_ridge = ridge_test.predict(train_x_and_y)

mean_squared_error(read_train_y, hypothesis_ridge)

453944276.6542021

In [293]:
np.sqrt(mean_squared_error(read_train_y, hypothesis_ridge))

21305.968099436413

## Testing Lasso Regression
### The alpha value is simply the regression parameter. As you increase the regression parameter, you subsequently increase the penalty. Thus, a lower regression parameter so the result of the regression does not overfit to the training data.

In [294]:
lasso_test = Lasso(alpha = 0.01)
lasso_test.fit(train_x_and_y, read_train_y)

  model = cd_fast.enet_coordinate_descent(


Lasso(alpha=0.01)

In [295]:
hypothesis_lasso = lasso.predict(train_x_and_y)

mean_squared_error(read_train_y, hypothesis_lasso)

395963032.7783118

In [296]:
np.sqrt(mean_squared_error(read_train_y, hypothesis_lasso))

19898.81988406126

## Conclusion
### I ultimately selected Lasso Regression as my metric of choice as Ridge Regression is inherently more susceptible to outliers, which a dataset of this nature will undoubtedly contain.
### Predictions can be found in the attached "Data_Challenge_01.csv"

In [297]:
hypothesis_lasso = pd.DataFrame(hypothesis_lasso, columns = ["Immigration to the United States"])
hypothesis_lasso = hypothesis_lasso.astype(int)

predictions_csv = hypothesis_lasso.to_csv("Data_Challenge_1.csv", index = None)