In [14]:
import pandas as pd
import sklearn.linear_model as lm
import sklearn.preprocessing as pre
import sklearn.model_selection as ms

import harness

In [2]:
# Read the train and test flight tables; the first CSV column becomes the index.
flights_train, flights_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/flights_train.csv', 'data/flights_test.csv')
)

In [3]:
# Pass the raw training frame through the harness's clean_train step.
flights_clean = harness.clean_train(flights_train)

In [4]:
def powers_of(df, degree):
    """Expand one numeric column into its standardized powers 1..degree.

    The column is standardized with ``StandardScaler`` and then expanded with
    ``PolynomialFeatures``; the leading constant (bias) column of that
    expansion is dropped.

    Args:
        df: Single-column DataFrame. Its column name is used as the prefix of
            the output column names.
        degree: Highest power to generate.

    Returns:
        DataFrame with columns ``<name>_1`` ... ``<name>_<degree>``. It is
        built from a bare array, so it carries a fresh default index rather
        than ``df.index``.

    Raises:
        ValueError: If ``df`` has more than one column.

    Note:
        The scaler is fit on the frame passed in, so the scaling statistics
        are those of that frame alone and are recomputed on every call.
    """
    column_name = df.columns[0]
    if len(df.columns) != 1:
        # With several inputs PolynomialFeatures also emits cross terms, so
        # the number of generated columns no longer equals `degree` and the
        # naming below would fail with an opaque shape-mismatch error.
        raise ValueError(
            f'powers_of expects exactly one column, got {len(df.columns)}'
        )
    scaled = pre.StandardScaler().fit_transform(df)
    # Column 0 of the expansion is the constant term; keep only the powers.
    dep_powers = pre.PolynomialFeatures(degree=degree).fit_transform(scaled)[:, 1:]
    return pd.DataFrame(
        dep_powers,
        columns=[f'{column_name}_{power}' for power in range(1, degree + 1)],
    )

In [5]:
def powers_of_time(df, degree=6):
    """Build polynomial features from scheduled departure and arrival times.

    Args:
        df: DataFrame containing ``crs_dep_time`` and ``crs_arr_time`` columns.
        degree: Highest power generated for each of the two columns.
            Defaults to 6, the value that was previously hard-coded.

    Returns:
        DataFrame holding the departure-time powers joined (on index) with
        the arrival-time powers.
    """
    departure_powers = powers_of(df[['crs_dep_time']], degree)
    arrival_powers = powers_of(df[['crs_arr_time']], degree)
    return departure_powers.join(arrival_powers)

In [6]:
# Pair the time-feature builder with the harness's normalize/unnormalize
# callables. NOTE(review): presumably the latter two transform the target and
# invert that transform — confirm against harness.DataTransformer.
transformer = harness.DataTransformer(
    powers_of_time, harness.normalize, harness.unnormalize
)

In [7]:
# Produce the inputs and targets later passed to the model's fit() from the
# cleaned training frame.
x_train, y_train = transformer.extract_transform(flights_clean)

In [16]:
def train_model():
    """Fit a Lasso regression on the training data and persist it.

    Reads the notebook-level ``x_train`` / ``y_train`` and hands the fitted
    estimator to ``harness.save`` under the name 'time_of_day_model'.
    Returns nothing.
    """
    features, target = x_train.values, y_train.values
    # fit() returns the estimator itself, so construction and fitting chain.
    fitted = lm.Lasso(alpha=0.01, random_state=42).fit(features, target)
    harness.save(fitted, 'time_of_day_model')

In [17]:
# Fit the Lasso model and hand it to harness.save as 'time_of_day_model'.
train_model()

In [18]:
# Load the estimator stored under 'time_of_day_model' and wrap it together
# with the same transformer that produced the training data.
linreg = harness.load('time_of_day_model')
linreg_model = harness.TrainedModel(linreg, transformer)

In [19]:
# Validate on the flights_test frame, cleaned with the same clean_train
# routine applied to the training data. NOTE(review): the metric that
# validate reports is defined in harness — confirm what the number measures.
linreg_model.validate(harness.clean_train(flights_test))

0.015541790152915014

In [12]:
# NOTE(review): presumably reads 'data/test.csv', predicts, and writes
# 'time_of_day_submission.csv' with a 'predicted_delay' column — confirm the
# argument meanings against harness.TrainedModel.submit.
linreg_model.submit(
    'data/test.csv', 'time_of_day_submission.csv', 'predicted_delay'
)