In [1]:
# Install Google Cloud SDK
!curl https://sdk.cloud.google.com | bash

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   443  100   443    0     0   2629      0 --:--:-- --:--:-- --:--:--  2636
Downloading Google Cloud SDK install script: https://dl.google.com/dl/cloudsdk/channels/rapid/install_google_cloud_sdk.bash
############################################################################################# 100.0%
Running install script from: /tmp/tmp.j56bnpwapY/install_google_cloud_sdk.bash
which curl
curl -# -f https://dl.google.com/dl/cloudsdk/channels/rapid/google-cloud-sdk.tar.gz
############################################################################################# 100.0%

mkdir -p /root
"/root/google-cloud-sdk" already exists and may contain out of date files.
Remove /root/google-cloud-sdk or select a new installation directory, then run again.


In [2]:
# Authenticating GCP and Colab
# Calls Colab's user sign-in helper before any GCP resource is touched.
# NOTE(review): the gcloud and google-cloud-storage cells below presumably rely on the
# credentials this call sets up for the runtime -- confirm when running outside Colab,
# where `google.colab` is not importable.
from google.colab import auth
auth.authenticate_user()

In [3]:
# Setting up GCP project
# Sets the gcloud CLI's `core/project` property. The same project ID is passed again to
# storage.Client(...) when the storage client is created, so keep the two in sync.
!gcloud config set project 'fa24-i535-skollep-ghgemissions'

Updated property [core/project].


In [4]:
# Import necessary libraries
from google.cloud import storage
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
import plotly.express as px

In [5]:
# --- Load the cleaned dataset from Cloud Storage into a DataFrame ---

# Name of the GCS bucket and of the single CSV object read from it
bucket_name = 'ghg_emissions_analysis_bucket'
file_name = 'cleaned_ghg_emissions_dataset.csv'

# Destination of the downloaded copy on the local filesystem
local_file_path = '/content/' + file_name

# Client bound to the project, and the bucket handle resolved through it
storage_client = storage.Client(project='fa24-i535-skollep-ghgemissions')
bucket = storage_client.get_bucket(bucket_name)

# Fetch the object to disk, then parse the local copy with pandas
blob = bucket.blob(file_name)
blob.download_to_filename(local_file_path)
df = pd.read_csv(local_file_path)

In [6]:
# Preview the first rows of the downloaded dataset (rendered as the cell's output)
df.head()

Unnamed: 0,Entity,Year,GDP per capita,Population,Urban share,CO₂ emissions,N2O emissions,Methane emissions,Energy use per person,GHG emissions per capita
0,Afghanistan,1981,10255.1085,11937587.0,16.562,0.165734,0.278202,0.97243,786.8369,1.687558
1,Afghanistan,1982,10255.1085,10991382.0,17.147,0.190566,0.306399,1.045692,926.65125,1.80414
2,Afghanistan,1983,10255.1085,10917986.0,17.747,0.230808,0.290531,1.009258,1149.1959,1.78283
3,Afghanistan,1984,10255.1085,11190220.0,18.365,0.252143,0.268575,0.9004,1121.5729,1.643149
4,Afghanistan,1985,10255.1085,11426855.0,18.997,0.30642,0.244525,0.817104,1067.0709,1.56564


In [7]:
# Choropleth map for GHG emissions per capita, animated over the Year column
emissions_col = "GHG emissions per capita"
map_html_path = 'ghg_emissions_map.html'

# Plotly Express creates animation frames in the order the frame values first appear in
# the data. The rows are grouped by Entity, so sort by Year (stable, to keep the
# per-country order within a year) to guarantee a chronological slider.
df_map = df.sort_values("Year", kind="stable")

# Pin the colour scale to the global min/max so a given colour means the same value in
# every frame instead of depending on whichever frame is being displayed.
emissions_range = (float(df_map[emissions_col].min()), float(df_map[emissions_col].max()))

fig = px.choropleth(df_map,
                    locations="Entity",
                    locationmode="country names",
                    color=emissions_col,
                    hover_name="Entity",
                    animation_frame="Year",
                    range_color=emissions_range,
                    color_continuous_scale=px.colors.sequential.Cividis,
                    projection="natural earth",
                    title="Per Capita Greenhouse Gas Emissions by Country"
)
fig.update_layout(
    geo=dict(showcoastlines=True, coastlinecolor="Black", showland=True, landcolor="white"),
    coloraxis_colorbar_title="GHG Emissions per Capita",
    width=1300,
    height=800,
)
fig.show()

# Save the plot as a standalone HTML file
fig.write_html(map_html_path)

# Upload the HTML map to the GCP bucket under the same object name
bucket = storage_client.get_bucket(bucket_name)
blob = bucket.blob(map_html_path)
blob.upload_from_filename(map_html_path)

In [8]:
# Split the frame into the regression target and its predictors.
target = df["GHG emissions per capita"]

# "Unnamed: 0" is the name pandas assigns to an unlabelled index column read back from a
# CSV; it is only a row counter, so it must not be fed to the models. errors="ignore"
# makes the drop a no-op when the column is absent (the target lookup above has already
# raised if the target column itself were missing).
features = df.drop(columns=["GHG emissions per capita", "Unnamed: 0"], errors="ignore")

# One-hot encoding for the categorical country column (first level dropped as baseline)
features = pd.get_dummies(features, columns=["Entity"], drop_first=True)

# Standardize every feature column -- the one-hot columns are scaled along with the
# numeric ones. The result is a NumPy array, so column names are not carried over.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [9]:
# Train-test split on the *unscaled* feature matrix. With the same random_state and row
# count this selects the same rows as splitting the pre-scaled matrix would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both partitions. Fitting it
# on all rows before the split lets test-set statistics leak into preprocessing.
split_scaler = StandardScaler()
X_train = split_scaler.fit_transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

# Model 1 - ordinary least squares linear regression
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions on the held-out rows
y_pred = model.predict(X_test)

# Evaluating performance on the held-out rows
print("Linear Regression:")
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R²): {r2}")
mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse}")

# Scale of the target, printed so the MSE values can be read in context
print("\nTarget variable range:", target.min(), "-", target.max())
print("Mean of target variable:", target.mean())


# Model 2 - Gradient Boosting Regressor, trained and scored on the same split
gb = GradientBoostingRegressor(random_state=42, n_estimators=100, learning_rate=0.1)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)

# Calculating Metrics
gb_r2 = r2_score(y_test, y_pred_gb)
gb_mse = mean_squared_error(y_test, y_pred_gb)

# Print Metrics
print("\nGradient Boosting:")
print(f"R² Score: {gb_r2}")
print(f"MSE: {gb_mse}")

Linear Regression:
R-squared (R²): 0.9619176642883988
MSE: 1.747533475027548

Target variable range: 0.6038687 - 43.737896
Mean of target variable: 7.735305173357646

Gradient Boosting:
R² Score: 0.9505955465095186
MSE: 2.2670861615181033


In [10]:
# Write both models' held-out predictions next to the true target values, then push the
# resulting CSV to the GCP bucket.
predictions_csv = 'test_predictions.csv'

# y_test keeps the row labels of the original frame; resetting them gives a 0..n-1 index
# so the Series lines up positionally with the two prediction arrays.
actual_values = y_test.reset_index(drop=True)

df_test_predictions = pd.DataFrame({
    'Linear_Regression_Predictions': y_pred,
    'Gradient_Boosting_Predictions': y_pred_gb,
    'Actual_Target': actual_values,
})
df_test_predictions.to_csv(predictions_csv, index=False)
print("Predictions and actual values saved to 'test_predictions.csv'")

# Upload the CSV to the bucket under the same object name
bucket = storage_client.get_bucket(bucket_name)
blob = bucket.blob(predictions_csv)
blob.upload_from_filename(predictions_csv)

Predictions and actual values saved to 'test_predictions.csv'
