In [23]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error , mean_squared_error , r2_score

In [2]:
data = pd.read_csv("co2.csv")
data

Unnamed: 0,time,co2
0,1958-03-29,316.1
1,1958-04-05,317.3
2,1958-04-12,317.6
3,1958-04-19,317.5
4,1958-04-26,316.4
...,...,...
2279,2001-12-01,370.3
2280,2001-12-08,370.8
2281,2001-12-15,371.2
2282,2001-12-22,371.3


In [6]:
def create_ts_data(data , window_size = 5 , target_size = 3):
    i = 1
    while i < window_size:
        data["co2_{}".format(i)] = data["co2"].shift(-i)
        i += 1
    i = 0
    while i < target_size:
        data["target_{}".format(i)] = data['co2'].shift(-i-window_size)
        i += 1
    data = data.dropna(axis = 0)

    return data

In [8]:
data['time'] = pd.to_datetime(data['time'])
data['co2'] = data['co2'].interpolate()

In [11]:
window_size = 5
target_size = 3
data = create_ts_data(data , window_size , target_size)

In [12]:
data

Unnamed: 0,time,co2,co2_1,co2_2,co2_3,co2_4,target_0,target_1,target_2
0,1958-03-29,316.1,317.3,317.6,317.5,316.4,316.90,317.20,317.50
1,1958-04-05,317.3,317.6,317.5,316.4,316.9,317.20,317.50,317.90
2,1958-04-12,317.6,317.5,316.4,316.9,317.2,317.50,317.90,317.55
3,1958-04-19,317.5,316.4,316.9,317.2,317.5,317.90,317.55,317.20
4,1958-04-26,316.4,316.9,317.2,317.5,317.9,317.55,317.20,316.85
...,...,...,...,...,...,...,...,...,...
2258,2001-07-07,372.1,371.3,371.2,370.6,369.9,369.50,369.30,369.00
2259,2001-07-14,371.3,371.2,370.6,369.9,369.5,369.30,369.00,368.40
2260,2001-07-21,371.2,370.6,369.9,369.5,369.3,369.00,368.40,368.20
2261,2001-07-28,370.6,369.9,369.5,369.3,369.0,368.40,368.20,368.00


In [14]:
target = ['target_{}'.format(i) for i in range(target_size)]
x = data.drop(['time'] + target , axis = 1)
y = data[target]
print(x)
print(y)

        co2  co2_1  co2_2  co2_3  co2_4
0     316.1  317.3  317.6  317.5  316.4
1     317.3  317.6  317.5  316.4  316.9
2     317.6  317.5  316.4  316.9  317.2
3     317.5  316.4  316.9  317.2  317.5
4     316.4  316.9  317.2  317.5  317.9
...     ...    ...    ...    ...    ...
2258  372.1  371.3  371.2  370.6  369.9
2259  371.3  371.2  370.6  369.9  369.5
2260  371.2  370.6  369.9  369.5  369.3
2261  370.6  369.9  369.5  369.3  369.0
2262  369.9  369.5  369.3  369.0  368.4

[2263 rows x 5 columns]
      target_0  target_1  target_2
0       316.90    317.20    317.50
1       317.20    317.50    317.90
2       317.50    317.90    317.55
3       317.90    317.55    317.20
4       317.55    317.20    316.85
...        ...       ...       ...
2258    369.50    369.30    369.00
2259    369.30    369.00    368.40
2260    369.00    368.40    368.20
2261    368.40    368.20    368.00
2262    368.20    368.00    367.40

[2263 rows x 3 columns]


In [15]:
train_size = 0.8
num_samples = len(x)

In [16]:
x_train = x[:int(num_samples*train_size)]
y_train = y[:int(num_samples*train_size)]
x_test = x[int(num_samples*train_size):]
y_test = y[int(num_samples*train_size):]

In [17]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(1810, 5)
(453, 5)
(1810, 3)
(453, 3)


In [22]:
r2 = []
mae = []
mse = []

In [26]:
regs = [LinearRegression() for _ in range(target_size)]
for i , reg in enumerate(regs):
    reg.fit(x_train , y_train["target_{}".format(i)])

In [27]:
for i , score in enumerate(regs):
    y_pred = reg.predict(x_test)
    r2.append(r2_score(y_test["target_{}".format(i)] , y_pred))
    mae.append(mean_absolute_error(y_test["target_{}".format(i)] , y_pred))
    mse.append(mean_squared_error(y_test["target_{}".format(i)] , y_pred))

In [29]:
print("R2 Score: {}".format(r2))
print("MAE: {}".format(mae))
print("MSE: {}".format(mse))

R2 Score: [0.9838658090344214, 0.9809385652719578, 0.9734758160596046]
MAE: [0.4879071345079299, 0.5180629695734064, 0.6413431934108796]
MSE: [0.39267912555015017, 0.4623990122471334, 0.6408569983105107]
