In [None]:
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.1.2'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Get:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease
Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [697 B]
Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:9 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Get:10 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:11 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:12 http://security.ubuntu.com/ubuntu bionic-securi

In [None]:
# --- Dependencies ---------------------------------------------------------
# Numerics, data frames and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# scikit-learn: splitting, preprocessing and the linear model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder

# Spark: session entry point and lookup helper for files added via addFile()
from pyspark import SparkFiles
from pyspark.sql import SparkSession

# --- Spark session ----------------------------------------------------------
# Create (or reuse) the session; the PostgreSQL JDBC jar path is put on the
# driver classpath.
spark = (
    SparkSession.builder
    .appName("World-Happiness")
    .config("spark.driver.extraClassPath","/content/postgresql-42.2.16.jar")
    .getOrCreate()
)

# --- Data -------------------------------------------------------------------
# Register the CSV hosted on S3 with the Spark context, then read the copy
# that SparkFiles resolves, with a header row and inferred column types.
url ="https://worldhappiness.s3.us-east-2.amazonaws.com/WHR_noNull.csv"
spark.sparkContext.addFile(url)
happiness_df = spark.read.csv(
    SparkFiles.get("WHR_noNull.csv"),
    sep=",",
    header=True,
    inferSchema=True,
)
happiness_df.show()

+-----------+----+--------------------+---------------+-------------------+--------------+---------------+-------+------------------+-------------------------+
|    country|year|        world_region|happiness_score|economic_production|social_support|life_expectancy|freedom|        generosity|perceptions_of_corruption|
+-----------+----+--------------------+---------------+-------------------+--------------+---------------+-------+------------------+-------------------------+
|    Denmark|2005|      Western Europe|          8.019|             10.851|         0.972|           69.6|  0.971|0.1615333333333333|                    0.237|
|    Denmark|2008|      Western Europe|          7.971|              10.88|         0.954|          70.08|   0.97|             0.272|                    0.248|
|    Finland|2020|      Western Europe|          7.889|              10.75|         0.962|           72.1|  0.962|            -0.116|                    0.164|
|    Finland|2018|      Western Europe| 

In [None]:
# happiness_df was read with Spark, and indexing a Spark DataFrame yields
# Column objects that scikit-learn cannot split (train_test_split raises a
# TypeError). Collect it into pandas first; the guard keeps this cell working
# if happiness_df is already a pandas DataFrame.
happiness_pdf = happiness_df.toPandas() if hasattr(happiness_df, "toPandas") else happiness_df

# Split our preprocessed data into our features and target arrays
feature_columns = [
    "economic_production",
    "social_support",
    "life_expectancy",
    "freedom",
    "generosity",
    "perceptions_of_corruption",
]
y = happiness_pdf["happiness_score"] # y = dependent = ladder score
X = happiness_pdf[feature_columns] # X = independent = any other variable

# Split the preprocessed data into a training and testing dataset
# (30% of the rows are held out for testing; the seed makes the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, test_size=0.3)

TypeError: ignored

In [None]:
# Standardise the features: the scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Transform train and test with the same fitted scaler
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
# Fit a LinearRegression model on the (unscaled) training features and target
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [None]:
# Report the fitted intercept, then pair each item yielded by iterating X
# with the coefficient at the same position
print("Intercept: ", regressor.intercept_)
print("Coefficients:")
[(feature, weight) for feature, weight in zip(X, regressor.coef_)]

In [None]:
# Predict the target for every row of the held-out test features
y_pred = regressor.predict(X=X_test)

In [None]:
# Print the predictions stored in y_pred
print(f"Prediction: {y_pred}")

In [None]:
# Put the actual and predicted test values side by side, largest actual first
comparison_columns = {'Actual value': y_test, 'Predicted value': y_pred}
regressor_diff = pd.DataFrame(comparison_columns)
values = regressor_diff.sort_values(by='Actual value', ascending=False)
values

In [None]:
#Model Evaluation
from sklearn import metrics

# Every metric is computed on the held-out test split. R squared was previously
# scored on the full X, y, which includes the rows the model was trained on and
# is not comparable with the test-set errors reported next to it.
MAE = metrics.mean_absolute_error(y_test, y_pred)
MSE = metrics.mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)  # reuse MSE instead of recomputing it
print('R squared: {:.2f}'.format(regressor.score(X_test, y_test)))
print('Mean Absolute Error:', MAE)
print('Mean Square Error:', MSE)
print('Root Mean Square Error:', RMSE)

In [None]:
# `values` is sorted by actual score but still carries its original row labels,
# and ax.plot() on a Series uses those labels as x positions, which undoes the
# sort. Dropping the index makes x the rank within the sorted table.
actual = values['Actual value'].reset_index(drop=True)
predicted = values['Predicted value'].reset_index(drop=True)

# Newer Matplotlib only ships the seaborn styles under a "seaborn-v0_8-" prefix;
# fall back to the old name where the prefixed one does not exist.
whitegrid_style = 'seaborn-v0_8-whitegrid'
if whitegrid_style not in plt.style.available:
    whitegrid_style = 'seaborn-whitegrid'
plt.style.use(whitegrid_style)
fig, ax = plt.subplots(figsize=(24, 12))

# Overlay both curves on one axis so their gap is visible at every rank
ax.plot(actual, color = 'green', label = 'Actual Score')
ax.plot(predicted, color = 'red', label = 'Predicted Score')
ax.set(xlabel='', ylabel='Happiness Score',
       title='Actual vs Predicted Happiness Scores');
ax.legend(loc = 'upper right')
plt.show()

In [None]:
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

# seaborn needs pandas data, but happiness_df was read with Spark. Collect it
# into pandas; the guard keeps this cell working if it is already pandas.
boxplot_df = happiness_df.toPandas() if hasattr(happiness_df, "toPandas") else happiness_df

# One horizontal box plot per feature, stacked in a single figure
boxplot_columns = [
    'economic_production',
    'social_support',
    'life_expectancy',
    'freedom',
    'generosity',
    'perceptions_of_corruption',
]
fig, axs = plt.subplots(len(boxplot_columns), figsize = (8,8))
# The data is passed as x= explicitly: a positional data argument is deprecated
# in seaborn and is interpreted differently by newer releases.
plt1, plt2, plt3, plt4, plt5, plt6 = [
    sns.boxplot(x=boxplot_df[column], ax=axis)
    for column, axis in zip(boxplot_columns, axs)
]
plt.tight_layout()

In [None]:
# seaborn needs pandas data, but happiness_df was read with Spark. Collect it
# into pandas; the guard keeps this cell working if it is already pandas.
pairplot_df = happiness_df.toPandas() if hasattr(happiness_df, "toPandas") else happiness_df

# Scatter happiness_score against each of the first three features
sns.pairplot(pairplot_df, x_vars=["economic_production", "social_support", "life_expectancy"],
             y_vars="happiness_score", height=7, aspect=1, kind='scatter')
plt.show()

In [None]:
# seaborn needs pandas data, but happiness_df was read with Spark. Collect it
# into pandas; the guard keeps this cell working if it is already pandas.
pairplot_df = happiness_df.toPandas() if hasattr(happiness_df, "toPandas") else happiness_df

# Scatter happiness_score against each of the remaining three features
sns.pairplot(pairplot_df, x_vars=["freedom", "generosity", "perceptions_of_corruption"],
             y_vars="happiness_score", height=7, aspect=1, kind='scatter')
plt.show()

In [None]:
# seaborn needs pandas data, but happiness_df was read with Spark. Collect it
# into pandas; the guard keeps this cell working if it is already pandas.
pairplot_df = happiness_df.toPandas() if hasattr(happiness_df, "toPandas") else happiness_df

# Scatter happiness_score by world region (wide figure to fit the region labels)
sns.pairplot(pairplot_df, x_vars="world_region",
             y_vars="happiness_score", height=10, aspect=2, kind='scatter')
plt.show()

In [None]:
# RANDOM FOREST MODEL

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the cleaned CSV from GitHub directly into a pandas DataFrame
url = "https://raw.githubusercontent.com/Wavelydavely/World_Happiness_Report/main/Cleaned_Data/WHR_noNull.csv"
happiness_no_null = pd.read_csv(filepath_or_buffer=url)

# Preview the first rows
happiness_no_null.head()


In [None]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the loaded frame stays untouched, then replace the
# country column with the integer codes produced by the encoder.
le = LabelEncoder()
df2 = happiness_no_null.copy()
country_codes = le.fit_transform(df2['country'])
df2['country'] = country_codes
df2.head()


In [None]:
# Encode the year column on a fresh copy. The existing `le` encoder is re-fitted
# here, so it no longer holds the mapping from its previous fit.
df3 = df2.copy()
year_codes = le.fit_transform(df3['year'])
df3['year'] = year_codes
df3.head()

In [None]:
# Encode the world_region column on a fresh copy. The existing `le` encoder is
# re-fitted here, so it no longer holds the mapping from its previous fit.
df4 = df3.copy()
region_codes = le.fit_transform(df4['world_region'])
df4['world_region'] = region_codes
df4.head()

In [None]:
# Features: every column of the encoded frame except the target.
# drop() returns a new frame, so df4 itself is left unchanged.
X = df4.drop(columns="happiness_score")
X

In [None]:
# Define the target set as a 1-D NumPy array.
# Series.ravel() is deprecated in recent pandas releases; to_numpy() yields
# the same array without the deprecation warning.
y = df4["happiness_score"].to_numpy()
y[:5]

In [None]:
# Split the random-forest features and target with a fixed seed.
# Note: X_train, X_test and y_train rebind the names used by the linear
# regression cells, while the test target gets its own name, y_test2.
(X_train, X_test, y_train, y_test2) = train_test_split(
    X,
    y,
    random_state=78,
)

In [None]:
# Standardise the random-forest features: fit on the training split only,
# then apply the same fitted scaler to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = [
    X_scaler.transform(split) for split in (X_train, X_test)
]

In [None]:
# Configure the random forest regressor: 128 trees and a fixed seed
rf_model = RandomForestRegressor(
    n_estimators=128,
    random_state=78,
)

In [None]:
# Train the forest on the standardised training features
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predict the target for the standardised test features
predictions = rf_model.predict(X=X_test_scaled)

In [None]:
#Model Evaluation
import numpy as np

from sklearn import metrics

# Every metric is computed on the held-out test split. The forest was fitted on
# standardised features, so it must also be scored on X_test_scaled; scoring it
# on the raw, full X (as before) fed it differently scaled inputs and mixed in
# the training rows.
MAE = metrics.mean_absolute_error(y_test2, predictions)
MSE = metrics.mean_squared_error(y_test2, predictions)
RMSE = np.sqrt(MSE)  # reuse MSE instead of recomputing it
print('R squared: {:.2f}'.format(rf_model.score(X_test_scaled, y_test2)))
print('Mean Absolute Error:', MAE)
print('Mean Square Error:', MSE)
print('Root Mean Square Error:', RMSE)

In [None]:
# Read the feature importances from the fitted Random Forest model and display
# them; the values are in the same order as the columns the model was fitted on.
importances = rf_model.feature_importances_
importances

In [None]:
# Pair each importance with its column name and list the pairs from most to
# least important.
importance_by_feature = list(zip(rf_model.feature_importances_, X.columns))
importance_by_feature.sort(reverse=True)
importance_by_feature

In [None]:
# Create vector of predictions.
# The forest was fitted on X_train_scaled, so it must be given the standardised
# test features; the raw X_test is on a different scale than the inputs the
# trees' split thresholds were learned from.
y_pred2 = rf_model.predict(X_test_scaled)
y_pred2

In [None]:
# Put actual and predicted test values side by side, largest prediction first,
# and renumber the rows 0..n-1 in that order.
regressor_diff2 = pd.DataFrame({'Actual value': y_test2, 'Predicted value': y_pred2})
values2 = (
    regressor_diff2
    .sort_values(by='Predicted value', ascending=False)
    .reset_index(drop=True)
)
values2

In [None]:
# values2 has a fresh 0..n-1 index, so the x axis is the rank by predicted score
actual = values2['Actual value']
predicted = values2['Predicted value']

# Newer Matplotlib only ships the seaborn styles under a "seaborn-v0_8-" prefix;
# fall back to the old name where the prefixed one does not exist.
whitegrid_style = 'seaborn-v0_8-whitegrid'
if whitegrid_style not in plt.style.available:
    whitegrid_style = 'seaborn-whitegrid'
plt.style.use(whitegrid_style)
fig, ax = plt.subplots(figsize=(24, 12))

# Overlay both curves on one axis so their gap is visible at every rank
ax.plot(actual, color = 'green', label = 'Actual Score')
ax.plot(predicted, color = 'red', label = 'Predicted Score')
ax.set(xlabel='', ylabel='Happiness Score',
       title='Actual vs Predicted Happiness Scores');
ax.legend(loc = 'upper right')
plt.show()