In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Directory holding the competition CSV files. Kept in a single constant so
# the notebook can be pointed at another copy of the data by editing one line.
DATA_DIR = '../input/tabular-playground-series-jul-2021'

# Training rows, rows to predict, and the submission template.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Print the column dtypes and non-null counts of the training frame.
train.info()

✅ There are no missing values.  
✅ All variable types are correct.

In [None]:
# Remove the timestamp column from both frames.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: running it a second time no longer raises KeyError
# once 'date_time' has already been dropped.
train = train.drop(columns='date_time', errors='ignore')
test = test.drop(columns='date_time', errors='ignore')

In [None]:
# Compute the summary statistics once, then print them as two narrower
# tables (moments first, then the quantiles).
summary = train.describe().T
print(summary[['count', 'mean', 'std']])
print(summary[['min', '25%', '50%', '75%', 'max']])

📌 The variables are on very different scales, especially the sensor readings.  
📌 Both 'target_carbon_monoxide' and 'target_benzene' have a minimum value of 0.1.

## Correlation Matrix

In [None]:
# Pairwise correlations between all columns of the training frame.
corr = train.corr()
# Boolean mask that is True on and above the diagonal, so only the lower
# triangle of the symmetric matrix is drawn.
mask = np.triu(np.ones(corr.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    corr,
    mask=mask,
    annot=True,
    cbar=False,
    square=True,
    cmap='coolwarm',
    linewidths=1,
    ax=ax,
)
plt.show()

📌 There is some multicollinearity between the variables.

In [None]:
# Per-column skewness, listed from the largest value to the smallest.
skewness = train.skew()
skewness.sort_values(ascending=False)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

In [None]:
# Names of the columns that are split off from the features and predicted.
targets = [
    'target_carbon_monoxide',
    'target_benzene',
    'target_nitrogen_oxides',
]

In [None]:
# Separate the training frame into the feature matrix (everything except the
# target columns) and the multi-column target frame.
X = train.drop(columns=targets)
y = train[targets]

In [None]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
split = train_test_split(X, y, test_size=0.33, random_state=1)
X_train, X_test, y_train, y_test = split

In [None]:
# Two-step pipeline: standardise the features, then fit a Lasso regression.
# The step names ('scale', 'model') are the prefixes used to address step
# parameters from outside the pipeline.
pipe_input = [
    ('scale', StandardScaler()),
    ('model', Lasso()),
]
pipe = Pipeline(steps=pipe_input)

In [None]:
# Hyper-parameter grid for the search over the pipeline.
# An empty grid would make the search evaluate a single candidate and tune
# nothing, so the Lasso regularisation strength is searched explicitly.
# Keys use scikit-learn's `<step>__<param>` form to reach the 'model' step.
# 1.0 is Lasso's default alpha, so the untuned model remains a candidate.
grid_params = [{'model__alpha': [0.001, 0.01, 0.1, 1.0]}]

In [None]:
# 5-fold cross-validated search over `grid_params`, scored by negated mean
# absolute error (higher is better); n_jobs=-1 uses all available cores.
grid = GridSearchCV(
    pipe,
    grid_params,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
)

In [None]:
# Run the cross-validated search on the training portion of the split.
grid.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the cross-validated search.
grid.best_params_

In [None]:
# Predict the held-out rows and display the mean absolute error per target,
# so the hold-out split is actually used to evaluate the fitted model
# instead of the predictions being computed and discarded.
predictions = grid.predict(X_test)
(y_test - predictions).abs().mean()

In [None]:
# 5-fold cross-validated negated MAE of `pipe` on the full training data;
# one score per fold.
cv_scores = cross_val_score(pipe, X, y, scoring='neg_mean_absolute_error', cv=5)
cv_scores

In [None]:
# Predict all target columns for the competition test rows with the fitted search object.
submission_preds = grid.predict(test)

In [None]:
# Copy each prediction column into the submission frame; column i of the
# prediction array corresponds to targets[i].
for col_idx, target_name in enumerate(targets):
    submission[target_name] = submission_preds[:, col_idx]

In [None]:
# Write the filled-in submission frame to disk, omitting the index column.
submission.to_csv("/kaggle/working/submission.csv", index=False)