In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import TimeSeriesSplit,cross_val_score,train_test_split, GridSearchCV
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt

In [2]:
df = pd.read_csv("../data/cleaned/global_projection.csv")
df.mean()

Year           2029.000000
CO2(ppm)        470.360980
global_anom       1.624435
sea_level       242.449028
dtype: float64

In [2]:
df = pd.read_csv("../data/cleaned/global_anomalies.csv")
df['ds'] = df['Year'].apply(lambda x:dt.date(x,12,31))
df.tail(2)

Unnamed: 0,Year,global_anom,All forcings,Human,Natural,Anthropogenic tropospheric aerosol,Greenhouse gases,Land use,Orbital changes,Ozone,Solar,Volcanic,ds
139,2019,1.19,1.108846,0.996126,0.112721,-0.695457,1.537588,-0.233444,0.030672,0.030672,0.030672,0.046275,2019-12-31
140,2020,1.22,1.134581,1.018415,0.116166,-0.709726,1.571692,-0.23772,0.03161,0.03161,0.03161,0.048304,2020-12-31


In [3]:
#Co2 data and projection
co2_df = pd.read_csv('../data/raw/co2/monthly_in_situ_co2_mlo.csv',
    skiprows=lambda x:x in np.arange(0,54),parse_dates=True)
co2_df_projection = pd.read_csv('../data/raw/co2/projection2100.csv',parse_dates=True)
co2_df1 = co2_df.iloc[2:757,[0,4,7]].copy()

In [4]:
co2_df1 = co2_df1.groupby('Year').mean()
final_co2 = co2_df_projection.merge(co2_df1,on='Year',how='left')
final_co2 = final_co2[final_co2['Year']>= 1958].reset_index(drop=True)
final_co2.head()

Unnamed: 0,Year,Global CO2 Equivalent Emissions (GtonsCO2/year),CO2(ppm),seasonally_adjustedfit(ppm)
0,1958,20.4768,232.269,315.229
1,1959,20.5436,315.981667,315.998333
2,1960,21.1158,316.909167,316.9025
3,1961,21.652,317.643333,317.628333
4,1962,22.1765,318.454167,318.45


In [5]:
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score
model_df = final_co2.dropna()
X = model_df['Global CO2 Equivalent Emissions (GtonsCO2/year)'].to_numpy().reshape(-1,1)
y = model_df['CO2(ppm)'].to_numpy().reshape(-1,1)
X_train,X_test,y_train,y_test = train_test_split(X,y)
model = LinearRegression()
model.fit(X_train,y_train)
pred = model.predict(X_test)
print(model.score(X_train,y_train))
print(model.score(X_test,y_test))
print('Mean Absolute Error:', mean_absolute_error(y_test, pred))
print('Mean Squared Error:', mean_squared_error(y_test, pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, pred)))

0.8001162267989913
0.9434977070677808
Mean Absolute Error: 4.722952176895163
Mean Squared Error: 43.43974122822095
Root Mean Squared Error: 6.590883190303175


In [6]:
pred = model.predict(final_co2[final_co2['Year']>2021]['Global CO2 Equivalent Emissions (GtonsCO2/year)'].to_numpy().reshape(-1,1))
pred_df = pd.DataFrame({'Year':np.arange(2022,2101),'Predictions':pred.ravel()})
pred_df.head()

Unnamed: 0,Year,Predictions
0,2022,429.065497
1,2023,432.093428
2,2024,435.12106
3,2025,438.148692
4,2026,441.176324


In [7]:
final_co2.iloc[64:,2] = pred.ravel()
final_co2 = final_co2.iloc[:,:3]
final_co2.head()

Unnamed: 0,Year,Global CO2 Equivalent Emissions (GtonsCO2/year),CO2(ppm)
0,1958,20.4768,232.269
1,1959,20.5436,315.981667
2,1960,21.1158,316.909167
3,1961,21.652,317.643333
4,1962,22.1765,318.454167


In [13]:
from sklearn.linear_model import Lasso
model2 = Lasso(alpha=0.1)
model2.fit(X_train,y_train)
pred2 = model2.predict(X_test)
print(model2.score(X_train,y_train))
print(model2.score(X_test,y_test))
print('Mean Absolute Error:', mean_absolute_error(y_test, pred2))
print('Mean Squared Error:', mean_squared_error(y_test, pred2))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, pred2)))

0.8001161821247522
0.9435523873634688
Mean Absolute Error: 4.721934017335521
Mean Squared Error: 43.397702263575425
Root Mean Squared Error: 6.587693242977805


In [32]:
#temperature predictions
temp_df = df[['Year','global_anom']]
temp_df = final_co2.merge(temp_df,on='Year',how='left')
model_temp_df = temp_df.dropna()
X = model_temp_df[['Global CO2 Equivalent Emissions (GtonsCO2/year)','CO2(ppm)']]
y = model_temp_df['global_anom'].to_numpy().reshape(-1,1)
X_train,X_test,y_train,y_test = train_test_split(X,y)
model2 = LinearRegression()
model2.fit(X_train,y_train)
pred2 = model2.predict(X_test)
print(model2.score(X_train,y_train))
print(model2.score(X_test,y_test))
print('Mean Absolute Error:', mean_absolute_error(y_test, pred2))
print('Mean Squared Error:', mean_squared_error(y_test, pred2))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, pred2)))

0.8865104142311051
0.8971710701254312
Mean Absolute Error: 0.08159029282608184
Mean Squared Error: 0.008964714475076522
Root Mean Squared Error: 0.09468217612136152


In [61]:
features = temp_df[temp_df['Year']>2020][['Global CO2 Equivalent Emissions (GtonsCO2/year)','CO2(ppm)']]
pred2 = model2.predict(features)
temp_df.iloc[63:,3] = pred2.ravel()
temp_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 143 entries, 0 to 142
Data columns (total 4 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   Year                                             143 non-null    int64  
 1   Global CO2 Equivalent Emissions (GtonsCO2/year)  143 non-null    float64
 2   CO2(ppm)                                         143 non-null    float64
 3   global_anom                                      143 non-null    float64
dtypes: float64(3), int64(1)
memory usage: 5.6 KB


In [79]:
sea_level_df = pd.read_csv('../data/cleaned/sea_level_global.csv')
final_df = temp_df.merge(sea_level_df,on='Year', how='left')
final_df.head()

Unnamed: 0,Year,Global CO2 Equivalent Emissions (GtonsCO2/year),CO2(ppm),global_anom,diff,rise_year
0,1958,20.4768,232.269,0.26,,
1,1959,20.5436,315.981667,0.23,,
2,1960,21.1158,316.909167,0.17,,
3,1961,21.652,317.643333,0.26,,
4,1962,22.1765,318.454167,0.23,,


In [74]:
#sea_level predictions
model_sea_df = final_df.dropna()
X = model_sea_df[['global_anom','CO2(ppm)']]
y = model_sea_df['diff'].to_numpy().reshape(-1,1)
X_train,X_test,y_train,y_test = train_test_split(X,y)
model3 = LinearRegression()
model3.fit(X_train,y_train)
pred3 = model3.predict(X_test)
print(model3.score(X_train,y_train))
print(model3.score(X_test,y_test))
print('Mean Absolute Error:', mean_absolute_error(y_test, pred3))
print('Mean Squared Error:', mean_squared_error(y_test, pred3))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, pred3)))

0.9998198308671348
0.9998784374549597
Mean Absolute Error: 0.6759127317308051
Mean Squared Error: 2.695088002587067
Root Mean Squared Error: 1.641672318883116


In [80]:
features1 = final_df[final_df['Year']>2020][['global_anom','CO2(ppm)']]
pred3 = model3.predict(features1)
final_df.iloc[63:,4] = pred3.ravel()
final_df = final_df.iloc[:,[0,2,3,4]].rename({'diff':'sea_level'},axis=1)
final_df.tail()

Unnamed: 0,Year,CO2(ppm),global_anom,sea_level
138,2096,648.231475,3.252822,443.124106
139,2097,648.765149,3.257691,443.928837
140,2098,649.301805,3.262586,444.738064
141,2099,649.835479,3.267454,445.542795
142,2100,650.372135,3.27235,446.352022


In [81]:
#global data projections
final_df.to_csv('../data/cleaned/global_projection.csv',index=False)