In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.preprocessing as preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn import linear_model

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn import tree


from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor

### Load and Prepare Data

In [2]:
# Read the three competition CSV files, in order: train, test, pretest.
# Paths are relative to the directory the notebook is run from.
df_train, df_test, df_pre_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        'tmlcc-2021/train.csv',
        'tmlcc-2021/test.csv',
        'tmlcc-2021/pretest.csv',
    )
)

In [3]:
# Treat infinities of either sign as missing values, then drop every row that
# has at least one missing value. `dropna()` alone does not remove +/-inf, so
# both signs must be mapped to NaN first.
# A new frame is built instead of mutating in place (`inplace=True`).
df_train = df_train.replace([np.inf, -np.inf], np.nan).dropna()

In [4]:
# Positional column indices: one shared list for the features, so the train and
# test frames are sliced identically, and a single index for the target.
feature_idx = [1, 2, 3, 4, 5, 11, 12]
target_idx = 13

df_X = df_train.iloc[:, feature_idx]
df_y = df_train.iloc[:, target_idx]

df_test_X = df_test.iloc[:, feature_idx]

In [5]:
# Create a StandardScaler and fit it on every row of df_X.
# NOTE(review): the fit happens before the train/validation split performed in
# a later cell, so the held-out rows also contribute to the scaling statistics.
# Fitting on the training split only would avoid that leakage.
sd_scale = preprocessing.StandardScaler()
sd_scale.fit(df_X)

StandardScaler()

In [6]:
# X: df_X passed through the fitted scaler; y: the target series, unchanged.
X, y = sd_scale.transform(df_X), df_y

In [7]:
# Hold out 30% of the rows for validation; the fixed seed makes the split repeatable.
holdout_split = train_test_split(X, y, test_size=0.30, random_state=42)
X_train, X_test, y_train, y_test = holdout_split

### Model

In [8]:
# Baseline: ordinary least squares fitted on the training split.
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
# Score: natural log of the mean absolute error on the held-out split.
log_mae = np.log(metrics.mean_absolute_error(y_test, y_pred))
print("log Mean absolute Error {}".format(log_mae))

log Mean absolute Error 3.689640706541135


In [9]:
# Bayesian ridge regression with default hyper-parameters.
reg = linear_model.BayesianRidge()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
# Score: natural log of the mean absolute error on the held-out split.
log_mae = np.log(metrics.mean_absolute_error(y_test, y_pred))
print("log Mean absolute Error {}".format(log_mae))

log Mean absolute Error 3.6896243194669527


In [10]:
# Degree-4 polynomial expansion followed by least squares.
# The linear step is fitted without its own intercept (fit_intercept=False).
model = Pipeline([('poly', PolynomialFeatures(degree=4)),
                  ('linear', LinearRegression(fit_intercept=False))])

model = model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Score: natural log of the mean absolute error on the held-out split.
log_mae = np.log(metrics.mean_absolute_error(y_test, y_pred))
print("log Mean absolute Error {}".format(log_mae))

log Mean absolute Error 3.226376084119628


In [11]:
# Lasso fitted with the LARS algorithm.
# NOTE(review): the `normalize` argument is deprecated/removed in newer
# scikit-learn releases - confirm against the installed version before re-running.
reg = linear_model.LassoLars(alpha=.1, normalize=False)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
# Score: natural log of the mean absolute error on the held-out split.
log_mae = np.log(metrics.mean_absolute_error(y_test, y_pred))
print("log Mean absolute Error {}".format(log_mae))

log Mean absolute Error 3.68909242228741


In [12]:
# Linear model trained with stochastic gradient descent.
# SGD shuffles the training data, so a fixed seed is needed for the score to be
# reproducible across kernel restarts.
reg = linear_model.SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
# Score: natural log of the mean absolute error on the held-out split.
log_mae = np.log(metrics.mean_absolute_error(y_test, y_pred))
print("log Mean absolute Error {}".format(log_mae))

log Mean absolute Error 3.6875459004561657


In [13]:
# Unpruned decision-tree regressor.
# The seed pins the tree's internal randomisation so repeated runs give the same score.
clf = tree.DecisionTreeRegressor(random_state=42)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Score: natural log of the mean absolute error on the held-out split.
log_mae = np.log(metrics.mean_absolute_error(y_test, y_pred))
print("log Mean absolute Error {}".format(log_mae))

log Mean absolute Error 3.4894515762370566


In [14]:
# Ensemble: a VotingRegressor combining gradient boosting, a random forest and
# the degree-4 polynomial regression pipeline.
reg1 = GradientBoostingRegressor(random_state=1)
reg2 = RandomForestRegressor(random_state=1)
reg3 = Pipeline([('poly', PolynomialFeatures(degree=4)),
                 ('linear', LinearRegression(fit_intercept=False))])
ereg = VotingRegressor(estimators=[('gb', reg1), ('rf', reg2), ('lr', reg3)])
# Fit on the training split directly. The full-data X / y and the previously
# fitted `model` are deliberately not rebound here, so the split cell and the
# polynomial cell stay valid if they are re-run.
ereg = ereg.fit(X_train, y_train)

y_pred = ereg.predict(X_test)
# Compute the validation MAE once; report its natural log and the raw value.
mae = metrics.mean_absolute_error(y_test, y_pred)
log_mae = np.log(mae)
print("log Mean absolute Error {}".format(log_mae))
print(mae)

log Mean absolute Error 3.1635835306679208
23.6552134065513


### Predict on the Test Set and Write the Submission File

In [15]:
from datetime import datetime

In [16]:
# Capture the current local time once and derive its POSIX timestamp (a float),
# which is later embedded in the submission file name.
now = datetime.now()
timestamp = now.timestamp()

In [17]:
# Scale the competition test features with the already-fitted scaler.
# A dedicated name is used so the validation matrix X_test (paired with y_test)
# is not overwritten, which keeps the evaluation cells re-runnable.
X_submit = sd_scale.transform(df_test_X)
pred = ereg.predict(X_submit)
# NOTE(review): the id range is hard-coded; presumably it matches the row order
# of test.csv - confirm, or read the ids from the file instead.
ID = np.arange(68614, 85614)
assert len(ID) == len(pred), "number of ids does not match number of predictions"
pd.DataFrame({'id':ID, 'CO2_working_capacity [mL/g]': pred}).set_index('id').to_csv(f'wonderland{timestamp}.csv')

In [18]:
# Sanity check: display the shape of the prediction array.
pred.shape

(17000,)

In [19]:
# Sanity check: display the shape of the id array (should match pred.shape).
ID.shape

(17000,)

In [20]:
# Natural log of 34.
# NOTE(review): presumably a reference point (an MAE of 34) for comparing the
# log-MAE scores printed above - confirm or remove.
np.log(34)

3.5263605246161616