In [2]:
import numpy as np
import pandas as pd
import pickle as pkl

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

### Load Data

In [3]:
df = pd.read_csv('train.csv')
print(df.shape)
df.head()

(20000, 9)


Unnamed: 0,Time,OZONE,NO2,temp,humidity,no2op1,no2op2,o3op1,o3op2
0,2019-03-27 13:01:00,77.59,6.881,36.2,38.9,199.0,200.0,240.0,197.0
1,2019-03-27 13:03:00,78.71,11.057,36.3,37.7,196.0,200.0,237.0,196.0
2,2019-03-27 13:04:00,78.85,8.596,36.7,38.0,195.0,199.0,235.0,196.0
3,2019-03-27 13:07:00,79.27,7.248,37.0,37.5,193.0,198.0,233.0,195.0
4,2019-03-27 13:08:00,80.01,8.638,36.8,36.8,191.0,198.0,231.0,195.0


In [4]:
def update_time(X):
    for i in range(len(X)):
        X[i][6] = int(X[i][6].split()[1][:2]) # hour
    return X

In [5]:
X = np.array(df[['no2op1','no2op2','o3op1','o3op2','temp','humidity','Time']]).reshape(-1, 7)
Y = np.array(df[['OZONE','NO2']]).reshape(-1, 2)

X = update_time(X)

# Splitting the data into training and testing data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 42)

### Part 1: Linear Models on only 4 features

In [19]:
# Select appropriate model
# lin_model = LinearRegression()                      # (L2 loss with no regularization)
# lin_model = Ridge(alpha=0.5)                      # (L2 loss with L2 regularization)
# lin_model = Lasso(alpha=0.1)                      # (L2 loss with L1 regularization)
lin_model = ElasticNet(alpha=0.1, l1_ratio=0.5)   # (L2 loss with L1 and L2 regularization)
# lin_model = SVR(kernel='linear', C=100, gamma='auto')

In [20]:
# o3
lin_model.fit(X[:,:4], Y[:,1])
print(lin_model.score(X[:,:4], Y[:,1]))
y_pred = lin_model.predict(X[:,:4])
print(mean_absolute_error(Y[:,1], y_pred))

0.29597038267902764
6.5401000938427885


In [21]:
# no2
lin_model.fit(X_train[:,:4], Y_train[:,1])
print(lin_model.score(X_train[:,:4], Y_train[:,1]))
y_pred = lin_model.predict(X_test[:,:4])
print(mean_absolute_error(Y_test[:,1], y_pred))

0.29205535251059855
6.617778763429134


### Part 2: Linear/Non-Linear Models on all features

In [6]:
# Linear Models
# model = LinearRegression()                      # (L2 loss with no regularization)
# model = Ridge(alpha=0.5)                      # (L2 loss with L2 regularization)
# model = Lasso(alpha=0.1)                      # (L2 loss with L1 regularization)
# model = ElasticNet(alpha=0.1, l1_ratio=0.5)   # (L2 loss with L1 and L2 regularization)

# Non-Linear Models
# model = SVR(kernel='linear', C=100, gamma='auto')
# model = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
# model = SVR(kernel='poly', C=100, gamma='auto', degree=3, epsilon=.1)
# model = KNeighborsRegressor(n_neighbors=6)
# model = DecisionTreeRegressor(max_depth=26, min_samples_leaf=3e-4, random_state=42)
model = RandomForestRegressor(n_estimators=500, random_state=42)
# model = MLPRegressor(hidden_layer_sizes=(100, 100, 100, 100, 100), activation='relu', solver='adam', max_iter=200, random_state=42)

In [7]:
# Without time feature

model.fit(X_train[:,:6], Y_train[:,0])
print(model.score(X_train[:,:6], Y_train[:,0]))
y_pred = model.predict(X_test[:,:6])
print(mean_absolute_error(Y_test[:,0], y_pred))

0.9880575256596679
3.7896239879577123


In [7]:
# With time feature

model.fit(X, Y[:,0])
print(model.score(X, Y[:,0]))
y_pred = model.predict(X)
print(mean_absolute_error(Y[:,0], y_pred))

0.9899023654492773
1.241584231922828


In [16]:
# Best predictor

model_o3 = KNeighborsRegressor(n_neighbors=6)
model_o3.fit(X, Y[:,0])
y_pred = model_o3.predict(X)
print(mean_absolute_error(Y[:,0], y_pred))

model_no2 = KNeighborsRegressor(n_neighbors=4)
model_no2.fit(X, Y[:,1])
y_pred = model_no2.predict(X)
print(mean_absolute_error(Y[:,1], y_pred))

models = [model_o3, model_no2]

# Save your model
with open( "model", "wb" ) as file:
    pkl.dump( models, file )

3.019486316666667
2.0417268125000003


In [17]:
%%timeit
model_o3.predict(X)

175 ms ± 3.74 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


<!-- Store Table for Results -->
### Results



| S.No. | Model                                    | O3 (MAE) | NO2 (MAE) |
|-------|------------------------------------------|----------|-----------|
| 1     | LinearRegression (First 4 features only) | 5.5859   | 6.6177    |
| 2     | Ridge (First 4 features only)            | 5.5859   | 6.6177    |
| 3     | Lasso (First 4 features only)            | 5.5859   | 6.6177    |
| 4     | LinearRegression (Without time feature)  | 5.2947   | 6.6177    |
| 5     | LinearRegression (With all features)     | 5.2669   | 6.6177    |
| 6     | KNeighbours (6)                          | 3.7352   | 6.6177    |
| 7     | Decision Tree (26, 3e-4)                 | 4.1369   | 6.6177    |
| 8     | Random Forest (500)                      | 3.4951   | 6.6177    |
| 9     | Neural Networks                          | 4.7241   | 6.6177    |