# IIMT2641 Assignment 3
## Sibo Ding
## Spring 2023

## Environment Setup

In [1]:
import pandas as pd
import statsmodels.api as sm

## Load the Data

In [2]:
# Read the climate records into a DataFrame.
climate_change = pd.read_csv("ClimateChange.csv")

# First look at the data: a preview, the table dimensions, and the column labels.
preview = climate_change.head()
dimensions = climate_change.shape
column_names = climate_change.columns
print('First 5 rows:\n', preview)
print('\nNumber of observations and variables:', dimensions)
print('\nNames of variables:', column_names)

First 5 rows:
    Year  Month    MEI     CO2      CH4      N2O   CFC.11   CFC.12        TSI  \
0  1983      5  2.556  345.96  1638.59  303.677  191.324  350.113  1366.1024   
1  1983      6  2.167  345.52  1633.71  303.746  192.057  351.848  1366.1208   
2  1983      7  1.741  344.15  1633.22  303.795  192.818  353.725  1366.2850   
3  1983      8  1.130  342.25  1631.35  303.839  193.602  355.633  1366.4202   
4  1983      9  0.428  340.17  1648.40  303.901  194.392  357.465  1366.2335   

   Aerosols   Temp  
0    0.0863  0.109  
1    0.0794  0.118  
2    0.0731  0.137  
3    0.0673  0.176  
4    0.0619  0.149  

Number of observations and variables: (308, 11)

Names of variables: Index(['Year', 'Month', 'MEI', 'CO2', 'CH4', 'N2O', 'CFC.11', 'CFC.12', 'TSI',
       'Aerosols', 'Temp'],
      dtype='object')


## Train-test Split

In [3]:
# Chronological split: observations up to and including 2006 are used for
# training, later observations are held out for testing.
in_training_period = climate_change['Year'].le(2006)
in_testing_period = climate_change['Year'].gt(2006)
climate_train = climate_change.loc[in_training_period]
climate_test = climate_change.loc[in_testing_period]

## Build Linear Regression Model

In [4]:
# Full model: regress temperature on all eight candidate predictors.
predictors_full = ['MEI', 'CO2', 'CH4', 'N2O',
                   'CFC.11', 'CFC.12', 'TSI', 'Aerosols']
y = climate_train['Temp']
# `add_constant` adds the intercept column (`const`) to the design matrix.
x = sm.add_constant(climate_train[predictors_full])
climate_reg1 = sm.OLS(endog=y, exog=x).fit()
print(climate_reg1.summary())

                            OLS Regression Results                            
Dep. Variable:                   Temp   R-squared:                       0.751
Model:                            OLS   Adj. R-squared:                  0.744
Method:                 Least Squares   F-statistic:                     103.6
Date:                Wed, 12 Apr 2023   Prob (F-statistic):           1.94e-78
Time:                        22:21:49   Log-Likelihood:                 280.10
No. Observations:                 284   AIC:                            -542.2
Df Residuals:                     275   BIC:                            -509.4
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -124.5943     19.887     -6.265      0.0

In [5]:
# In-sample R^2 of the full model (`climate_reg1`, fitted on the training data)
climate_reg1.rsquared

0.7508932770523421

In [6]:
# Coefficients of the full model whose p-values are below the 5% level.
# Note: the intercept (`const`) is listed too, but it is not an independent variable.
significant_reg1 = climate_reg1.pvalues < 0.05
climate_reg1.params[significant_reg1]

const      -124.594260
MEI           0.064205
CO2           0.006457
CFC.11       -0.006630
CFC.12        0.003808
TSI           0.093141
Aerosols     -1.537613
dtype: float64

The coefficients of `N2O` and `CFC.11` are negative, probably because these variables are highly correlated with other variables, either inside the model (multicollinearity) or outside it (omitted variables).

## Correlation

In [7]:
# Pairwise correlation matrix of all columns in the training data
climate_train.corr()

Unnamed: 0,Year,Month,MEI,CO2,CH4,N2O,CFC.11,CFC.12,TSI,Aerosols,Temp
Year,1.0,-0.027942,-0.036988,0.982749,0.915659,0.993845,0.569106,0.897012,0.170302,-0.345247,0.786797
Month,-0.027942,1.0,0.000885,-0.106732,0.018569,0.013632,-0.013111,0.000675,-0.034606,0.01489,-0.099857
MEI,-0.036988,0.000885,1.0,-0.041147,-0.033419,-0.05082,0.069,0.008286,-0.154492,0.340238,0.172471
CO2,0.982749,-0.106732,-0.041147,1.0,0.87728,0.97672,0.51406,0.85269,0.177429,-0.356155,0.788529
CH4,0.915659,0.018569,-0.033419,0.87728,1.0,0.899839,0.779904,0.963616,0.245528,-0.267809,0.703255
N2O,0.993845,0.013632,-0.05082,0.97672,0.899839,1.0,0.522477,0.867931,0.199757,-0.337055,0.778639
CFC.11,0.569106,-0.013111,0.069,0.51406,0.779904,0.522477,1.0,0.868985,0.272046,-0.043921,0.40771
CFC.12,0.897012,0.000675,0.008286,0.85269,0.963616,0.867931,0.868985,1.0,0.255303,-0.225131,0.687558
TSI,0.170302,-0.034606,-0.154492,0.177429,0.245528,0.199757,0.272046,0.255303,1.0,0.052117,0.243383
Aerosols,-0.345247,0.01489,0.340238,-0.356155,-0.267809,-0.337055,-0.043921,-0.225131,0.052117,1.0,-0.384914


In [8]:
# Variables that are highly correlated with `N2O` in the training data.
# The absolute value is used so that strong negative correlations
# (r < -0.7) are not missed, and the correlation matrix is computed only once.
# Note: `N2O` itself (r = 1) is listed, and `Temp` is the dependent variable.
n2o_corr = climate_train.corr()['N2O']
n2o_corr[n2o_corr.abs() > 0.7]

Year      0.993845
CO2       0.976720
CH4       0.899839
N2O       1.000000
CFC.12    0.867931
Temp      0.778639
Name: N2O, dtype: float64

In [9]:
# Variables that are highly correlated with `CFC.11` in the training data.
# The absolute value is used so that strong negative correlations
# (r < -0.7) are not missed, and the correlation matrix is computed only once.
# Note: `CFC.11` itself (r = 1) is listed.
cfc11_corr = climate_train.corr()['CFC.11']
cfc11_corr[cfc11_corr.abs() > 0.7]

CH4       0.779904
CFC.11    1.000000
CFC.12    0.868985
Name: CFC.11, dtype: float64

## Simplify the Model

In [10]:
# Simplified model: keep only MEI, TSI, Aerosols and N2O as predictors.
predictors_simple = ['MEI', 'TSI', 'Aerosols', 'N2O']
y = climate_train['Temp']
# `add_constant` adds the intercept column (`const`) to the design matrix.
x = sm.add_constant(climate_train[predictors_simple])
climate_reg2 = sm.OLS(endog=y, exog=x).fit()
print(climate_reg2.summary())

                            OLS Regression Results                            
Dep. Variable:                   Temp   R-squared:                       0.726
Model:                            OLS   Adj. R-squared:                  0.722
Method:                 Least Squares   F-statistic:                     184.9
Date:                Wed, 12 Apr 2023   Prob (F-statistic):           3.52e-77
Time:                        22:21:49   Log-Likelihood:                 266.64
No. Observations:                 284   AIC:                            -523.3
Df Residuals:                     279   BIC:                            -505.0
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -116.2269     20.223     -5.747      0.0

In [11]:
# Compare the `N2O` coefficient in the full and the simplified model:
# its sign flips between the two.
pd.Series({
    'Previous N2O': climate_reg1.params['N2O'],
    'Simplified N2O': climate_reg2.params['N2O'],
})

Previous N2O     -0.016528
Simplified N2O    0.025320
dtype: float64

In [12]:
# Coefficients of the simplified model whose p-values are below the 5% level.
# Note: the intercept (`const`) is listed too, but it is not an independent variable.
significant_reg2 = climate_reg2.pvalues < 0.05
climate_reg2.params[significant_reg2]

const      -116.226858
MEI           0.064186
TSI           0.079490
Aerosols     -1.701737
N2O           0.025320
dtype: float64

A higher proportion of the independent variables is significant at the 5% level in the simplified model than in the full model (4 out of 4 vs. 6 out of 8).

## Out-of-sample $R^2$

In [13]:
# Design matrix for the held-out data, using the same predictors as
# `climate_reg2`. `has_constant='add'` forces the intercept column to be
# added: the default ('skip') silently omits it whenever a column of the
# data happens to be constant, which would make the test design matrix
# inconsistent with the fitted model.
climate_test_x = sm.add_constant(
    climate_test[['MEI', 'TSI', 'Aerosols', 'N2O']], has_constant='add')
climate_predict = climate_reg2.predict(climate_test_x)

# Out-of-sample R^2 = 1 - SSE/SST.
# SSE: squared prediction errors of the model on the test set.
# SST: squared errors of the baseline, which predicts the mean of the
#      *training* temperatures for every test observation.
SSE = sum((climate_test['Temp'] - climate_predict) ** 2)
SST = sum((climate_test['Temp'] - climate_train['Temp'].mean()) ** 2)
1 - SSE/SST

0.4967794872203861