
# TDT4173 Machine Learning Short Notebook 2

### Kaggle username: Group 43

### Blackboard group: 043

#### Team members: Ola Sæther (544629), Olav Finne Præsteng Larsen (542616), Simeon Christoffersen (543897)



In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
pd.options.mode.chained_assignment = None

# CatBoost and AutoGluon


## Path Constants

<strong> !NOTE! Change for your path during testing !NOTE! </strong>


In [2]:
def get_X_train_estimated_path(dataset):
    """
    Path for importing X_train_estimated
    Change for what the path during testing is when reviewing
    :param dataset:
    :return:
    """
    return f'../data/{dataset}/X_train_estimated.parquet'

def get_X_train_observed_path(dataset):
    """
    Path for importing X_train_observed
    Change for what the path during testing is when reviewing
    :param dataset:
    :return:
    """
    return f'../data/{dataset}/X_train_observed.parquet'

def get_target_path(dataset):
    """
    Path for importing target
    Change for what the path during testing is when reviewing
    :param dataset:
    :return:
    """
    return f"../data/{dataset}/train_targets.parquet"

def get_test_estimated_path(dataset):
    """
    Path for importing X_test_estimated
    Change for what the path during testing is when reviewing
    :param dataset:
    :return:
    """
    return f'../data/{dataset}/X_test_estimated.parquet'

## Data Grouping

In [3]:
def combine_data(dataset):
    """
    Method for combining the target datasets y, with the X_estimated and X_observed. Also adds a column for knowing what data is estimated and not. This column is called validation.
    :param dataset:
    :return:
    """
    X_train_estimated = pd.read_parquet(get_X_train_estimated_path(dataset))
    X_train_observed = pd.read_parquet(get_X_train_observed_path(dataset))
    target = pd.read_parquet(get_target_path(dataset))

    X_train_estimated['validation'] = True
    X_train_observed['validation'] = False

    df = pd.concat([X_train_observed, X_train_estimated], axis=0)
    df.rename(columns={"date_forecast":"datetime"}, inplace=True)
    df['date'] = df['datetime'].dt.date
    df['hour'] = df['datetime'].dt.hour

    target['date'] = target['time'].dt.date
    target['hour'] = target['time'].dt.hour
    df = df.merge(target, on=['date','hour'], how='inner')

    df = df.dropna(subset=['pv_measurement'])
    df.set_index('datetime', inplace=True)
    df.drop(columns=['date_calc'], inplace=True)
    df.drop(columns=['date', 'hour','time'], inplace=True)

    return df


In [4]:
categorical_features = ['precip_type_5min_idx', 'dew_or_rime_idx', 'is_day_idx', 'is_in_shadow_idx']
X_test_A = pd.read_parquet(get_test_estimated_path('A'))
X_test_A = X_test_A.rename(columns={'date_forecast': 'datetime'})
X_test_A = X_test_A.drop(columns=['date_calc'])
X_test_A.set_index("datetime", inplace=True)




X_test_B = pd.read_parquet(get_test_estimated_path('B'))
X_test_B = X_test_B.rename(columns={'date_forecast': 'datetime'})
X_test_B = X_test_B.drop(columns=['date_calc'])
X_test_B.set_index("datetime", inplace=True)



X_test_C = pd.read_parquet(get_test_estimated_path('C'))
X_test_C = X_test_C.rename(columns={'date_forecast': 'datetime'})
X_test_C = X_test_C.drop(columns=['date_calc'])
X_test_C.set_index("datetime", inplace=True)




In [5]:
merged_A = combine_data('A')
merged_B = combine_data('B')
merged_C = combine_data('C')

## Data Cleaning and feature engineering

In [6]:
def most_frequent(x):
    """
    Method used to combine rows from quarters into hours for categorical features. Takes the feature that is most present for that hour and sets is as the value for the hour
    :param x: 4 values, 1 for each quarter of the hour
    :return: The value that should be set for the hour
    """
    counts = x.value_counts()
    if counts.empty:
        return None
    return counts.index[0] if all(counts == counts.iloc[0]) else counts.idxmax()



def remove_24h_zeros(df, column):
    """
    Method to remove rolling consecutive zeros in the datasets. Includes the current row and counts number of zeros, if there are more than 24, it means the entire day is 0, then it continues until it find the next actual value, and removes those rows. They are considered "bad data".
    :param df: Dataset
    :param column: Pv_measurement column
    :return: Dataset without consecutive zeros
    """
    zeros_mask = df[column] == 0

    # Use rolling sum to count consecutive zeros, shift the window by 23 periods
    # because rolling() includes the current row by default
    rolling_zeros = zeros_mask.rolling(window=24, min_periods=24).sum().shift(-23)

    # Keep rows where the count of rolling sum of consecutive zeros is less than 24
    df_filtered = df[rolling_zeros < 24]

    return df_filtered



def feature_eng(df):
    """
    Method for feature engineering each dataset.

    Removes columns:['cloud_base_agl_m', 'ceiling_height_agl_m', 'snow_density_kgm3', 'snow_drift_idx', 'elevation_m']

    Merges the dataset from each row being a quarter, to each row being an hour. This is done differently for the different features, where the features that measures something every quarter, like direct_rad_w are summed into an hour. Features like humidity are taken the mean of each quarter to represent the hour, and features that measures something for the last hour, like clear_sky_energy_1h_J, have the last quarter taken as the value for the hour, since they measure the last hour. Categorical features like percip_type, have the most frequent value added as hourly value.

    :param df: Dataset
    :return: Dataset that has merged rows, removed columns, and added columns based on sin/cos of hour, day, month
    """
    df.drop(columns=['cloud_base_agl_m', 'ceiling_height_agl_m', 'snow_density_kgm3', 'snow_drift_idx', 'elevation_m'], inplace=True)

    sum_cols = [ 'clear_sky_rad_W','direct_rad_W', 'diffuse_rad_W', 'precip_5min_mm', 'rain_water_kgm2', 'snow_water_kgm2', 'snow_melt_10min_mm', 'super_cooled_liquid_water_kgm2']
    
    last_cols = ['clear_sky_energy_1h_J','direct_rad_1h_J','fresh_snow_1h_cm', 'diffuse_rad_1h_J','fresh_snow_6h_cm', 'fresh_snow_3h_cm', 'fresh_snow_12h_cm', 'fresh_snow_24h_cm']
    
    
    mean_columns = [col for col in df.columns if col not in categorical_features and col not in sum_cols and col not in last_cols]
    agg_dict = {col: 'mean' for col in mean_columns}
    agg_dict.update({col: most_frequent for col in categorical_features})
    agg_dict.update({col: 'sum' for col in sum_cols})
    agg_dict.update({col: 'last' for col in last_cols})

    df = df.resample('H').agg(agg_dict)

    df.sort_index(inplace=True)
    df['cos_hour'] = np.cos(2 * np.pi * df.index.hour / 24)
    df['cos_month'] = np.cos(2 * np.pi * (df.index.month) / 12)
    df['cos_day_of_month'] = np.cos(2*np.pi * df.index.day / 30 )
    df['sin_hour'] = np.sin(2 * np.pi * df.index.hour / 24)
    df['sin_day_of_month'] = np.sin(2*np.pi * df.index.day / 30 )
    df['sin_month'] = np.sin(2 * np.pi * (df.index.month) / 12)
    return df

def clean_train(df):
    """
    Method for cleaning the training data. Replaces : with _ in the colum names. Removes any days that have a daily sum of pv_measurement of 0, since no power was generated that day, which the group recognizes as bad data

    :param df: Dataset
    :return: Cleaned Dataset
    """
    df.columns = [col.replace(':', '_') for col in df.columns]

    df['DailySum'] = df.groupby(df.index.date)['pv_measurement'].transform('sum')
    df = df[df['DailySum'] > 0]
    df.drop('DailySum', axis=1, inplace=True)
    
    return df

def clean_test(df):
    """
    Method for cleaning test data. Fixes naming as above
    :param df:
    :return:
    """
    df.columns = [col.replace(':', '_') for col in df.columns]
    return df



def remove_static_pv_measurements(df, column):
    """
    Method for removing static pv_measurements. This was found during EDA and the group removes pv_measurement if it has the same value for more than 2 hours after each other, unless the value is 0, because of nighttime.

    :param df: Dataset
    :param column: Column to check for. Pv_measurement
    :return: Cleaned Dataset
    """

    # Boolean mask to check if the current value is the same as the previous and the next
    same_as_prev = (df[column].shift(1) == df[column]) & (df[column] > 0)
    same_as_next = (df[column].shift(-1) == df[column]) & (df[column] > 0)
    
    # Create a mask where either condition is True
    mask_to_drop = same_as_prev | same_as_next
    
    # Drop rows where the mask is True
    df_dropped = df[~mask_to_drop]

    return df_dropped

def remove_pv_measurements(df):
    """
    Remove pv_measurement if less than 0.1 and if sun_elevation is lower than -5 degrees
    :param df:
    :return:
    """
    # drop all rows where pv_measurement > 0.1 and sun_elevation_d < -7
    df = df[~((df['pv_measurement'] > 0.1) & (df['sun_elevation_d'] < -5))]
    return df

def clean_data(df_train, df_test, cat = True):
    """
    Main method for data cleaning. Contains all methods described above. Also sets validation and test boolean on dataset
    :param cat: Whether catboost or autogluon is running
    :param df_train:
    :param df_test:
    :return:
    """
    df_train.index = pd.to_datetime(df_train.index)
    df_train = clean_train(df_train)
    df_train['test'] = False

    df_test.index = pd.to_datetime(df_test.index)
    df_test = clean_test(df_test)

    df_test['test'] = True
    df_test['validation'] = True

    df_train = feature_eng(df_train)    
    df_test = feature_eng(df_test)
    
    df_train = remove_static_pv_measurements(df_train, 'pv_measurement')
    df_train = remove_24h_zeros(df_train, 'pv_measurement')
    if cat:
        df_train = remove_pv_measurements(df_train)

    df = pd.concat([df_train, df_test], axis=0)

    train_df = df[df['test'] == False].drop(columns=['test'])
    test_df = df[df['test'] == True].drop(columns=['test', 'pv_measurement'])

    # Asserts the testset has the correct length for easier debugging, instead of finding out after all training
    assert len(test_df) == 720
    train_df[categorical_features] = train_df[categorical_features].astype(int)
    test_df[categorical_features] = test_df[categorical_features].astype(int)
    
    return train_df, test_df

In [7]:
cleaned_A_train, cleaned_A_test = clean_data(merged_A, X_test_A)
cleaned_B_train, cleaned_B_test = clean_data(merged_B, X_test_B)
cleaned_C_train, cleaned_C_test = clean_data(merged_C, X_test_C)

# Asserts implemented to assert that there are non nan values in the datasets
assert not cleaned_A_train.isna().any().any()
assert not cleaned_A_test.isna().any().any()

assert not cleaned_B_train.isna().any().any()
assert not cleaned_B_test.isna().any().any()

assert not cleaned_C_train.isna().any().any()
assert not cleaned_C_test.isna().any().any()


## Training and validation

In [8]:
from sklearn.model_selection import train_test_split


def get_validation_data(X, seed):
    """
    Method for getting training and validation data. The split is 0.1.
    :param X: Training data
    :param seed: Seed for determining split. Added for reproducibility
    :return: X_train, X_validation
    """
    return train_test_split(X, test_size=0.1, random_state=seed)


### CatBoost

In [9]:
def get_regressor():
    """
    Method for getting the same regressor with the same hyperparameters for all training rounds
    :return: CatBoostRegressor with correct hyperparameters
    """
    return CatBoostRegressor(loss_function='MAE', verbose=500, n_estimators=19054, l2_leaf_reg=5, depth=6, random_state=42, early_stopping_rounds=100, learning_rate=0.029854477813327555)



### A

In [10]:
seeds = [42, 54, 66, 358, 123456]
preds_A = []
for i in seeds:
    X_A, Val_A = get_validation_data(cleaned_A_train, i)
    model_a = get_regressor()
    
    X_pool = Pool(X_A, label=X_A.pop('pv_measurement'), cat_features=categorical_features)
    X_pool_val = Pool(Val_A, label=Val_A.pop('pv_measurement'), cat_features=categorical_features)
    
    X_pool_test = Pool(cleaned_A_test, cat_features=categorical_features)
    
    model_a.fit(X_pool,eval_set=[X_pool_val])

    
    prediction_A = model_a.predict(X_pool_test)
    preds_A.append(prediction_A)
    
prediction_A = np.mean(preds_A, axis=0)


0:	learn: 616.9866212	test: 624.5743384	best: 624.5743384 (0)	total: 172ms	remaining: 54m 44s
500:	learn: 189.9363002	test: 190.9024445	best: 190.9024445 (500)	total: 10.7s	remaining: 6m 37s
1000:	learn: 173.6603677	test: 177.8841417	best: 177.8812507 (999)	total: 21.2s	remaining: 6m 23s
1500:	learn: 167.8491291	test: 174.2847148	best: 174.2847148 (1500)	total: 31.7s	remaining: 6m 11s
2000:	learn: 162.7864280	test: 172.3517130	best: 172.3517130 (2000)	total: 42.2s	remaining: 5m 59s
2500:	learn: 158.4476499	test: 170.4560616	best: 170.4427827 (2495)	total: 52.6s	remaining: 5m 48s
3000:	learn: 155.0901949	test: 169.3996843	best: 169.3931822 (2997)	total: 1m 3s	remaining: 5m 37s
3500:	learn: 152.0539049	test: 168.1165459	best: 168.1165459 (3500)	total: 1m 13s	remaining: 5m 25s
4000:	learn: 150.1111104	test: 167.6706009	best: 167.6695614 (3999)	total: 1m 23s	remaining: 5m 14s
4500:	learn: 148.0702632	test: 167.0083550	best: 167.0061857 (4498)	total: 1m 33s	remaining: 5m 3s
5000:	learn: 145

### B

In [11]:
preds_B = []
for i in seeds:
    X_B, Val_B = get_validation_data(cleaned_B_train,i)
    model_b = get_regressor()
    
    X_pool = Pool(X_B, label=X_B.pop('pv_measurement'), cat_features=categorical_features)
    X_pool_val = Pool(Val_B, label=Val_B.pop('pv_measurement'), cat_features=categorical_features)
    X_pool_test = Pool(cleaned_B_test,cat_features=categorical_features)
    
    model_b.fit(X_pool,eval_set=X_pool_val)
    
    
    
    prediction_B = model_b.predict(X_pool_test)
    preds_B.append(prediction_B)
    
prediction_B = np.mean(preds_B, axis=0)


0:	learn: 102.4837694	test: 101.7755848	best: 101.7755848 (0)	total: 20.5ms	remaining: 6m 31s
500:	learn: 24.4087947	test: 26.2104956	best: 26.2104956 (500)	total: 9.75s	remaining: 6m 1s
1000:	learn: 22.9604440	test: 25.3502606	best: 25.3493427 (988)	total: 19.3s	remaining: 5m 48s
1500:	learn: 22.1840960	test: 25.0673780	best: 25.0661993 (1497)	total: 28.9s	remaining: 5m 38s
2000:	learn: 21.5823521	test: 24.8790912	best: 24.8742428 (1986)	total: 38.5s	remaining: 5m 27s
2500:	learn: 21.0739991	test: 24.6557092	best: 24.6530070 (2497)	total: 48s	remaining: 5m 17s
3000:	learn: 20.5397941	test: 24.4975940	best: 24.4975940 (3000)	total: 57.5s	remaining: 5m 7s
3500:	learn: 20.1174016	test: 24.3773786	best: 24.3764902 (3471)	total: 1m 7s	remaining: 5m
4000:	learn: 19.7243861	test: 24.2468284	best: 24.2453412 (3993)	total: 1m 17s	remaining: 4m 50s
4500:	learn: 19.3673498	test: 24.1616414	best: 24.1601585 (4495)	total: 1m 26s	remaining: 4m 40s
5000:	learn: 19.0741022	test: 24.1073678	best: 24.0

### C

In [12]:
preds_C = []
for i in seeds:
    X_C, Val_C = get_validation_data(cleaned_C_train,i)
    model_c = get_regressor()
    
    X_pool = Pool(X_C, label=X_C.pop('pv_measurement'), cat_features=categorical_features)
    X_pool_val = Pool(Val_C, label=Val_C.pop('pv_measurement'), cat_features=categorical_features)
    X_pool_test = Pool(cleaned_C_test, cat_features=categorical_features)
    
    model_c.fit(X_pool,eval_set=X_pool_val)
    
    
    
    prediction_C = model_c.predict(X_pool_test)
    preds_C.append(prediction_C)
    
prediction_C = np.mean(preds_C, axis=0)


0:	learn: 93.2805971	test: 90.8268406	best: 90.8268406 (0)	total: 17.9ms	remaining: 5m 40s
500:	learn: 21.5595037	test: 22.6023289	best: 22.6023289 (500)	total: 8.75s	remaining: 5m 24s
1000:	learn: 20.2206213	test: 21.9536612	best: 21.9536612 (999)	total: 17.4s	remaining: 5m 13s
1500:	learn: 19.3013282	test: 21.5367529	best: 21.5367529 (1500)	total: 26s	remaining: 5m 3s
2000:	learn: 18.5691417	test: 21.2809085	best: 21.2808937 (1996)	total: 34.5s	remaining: 4m 54s
2500:	learn: 18.0810763	test: 21.1224701	best: 21.1175786 (2484)	total: 43.1s	remaining: 4m 45s
3000:	learn: 17.6158475	test: 20.9922030	best: 20.9922030 (3000)	total: 51.6s	remaining: 4m 36s
3500:	learn: 17.2353239	test: 20.9330698	best: 20.9308635 (3499)	total: 1m	remaining: 4m 27s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 20.89735612
bestIteration = 3708

Shrink model to first 3709 iterations.
0:	learn: 93.1190987	test: 93.3112604	best: 93.3112604 (0)	total: 18.4ms	remaining: 5m 49s
500:	learn: 21.

In [13]:

assert len(prediction_A) == 720 and len(prediction_B) == 720 and len(prediction_C) == 720
from datetime import datetime

t = []
t.extend(prediction_A)
t.extend(prediction_B)
t.extend(prediction_C)


t = [max(i,0) for i in t] # Remove any negative values

df_cat = pd.DataFrame({'id': range(720 * 3), 'prediction': [0] * (720 * 3)})
df_cat['prediction'] = t


### AutoGluon

In [14]:

from autogluon.tabular import TabularPredictor


def auto_gl(train_data, test_data,validation_data, n=5):
    validation_data = validation_data.copy()
    predictor = TabularPredictor(label='pv_measurement',eval_metric='mae', problem_type='regression')


    predictor.fit(train_data, presets=['best_quality'], time_limit=10800)
    predictor.evaluate(validation_data, silent=False)
    top_models = predictor.leaderboard(validation_data, silent=False)

    top_models = top_models.head(n)  # Select the top 10 models
    top_model_predictions = []
    for model in top_models['model']:
        model_predictions = predictor.predict(test_data, model=model)
        top_model_predictions.append(model_predictions)

    mean_prediction = np.mean(top_model_predictions, axis=0)
    submit_pred = mean_prediction.tolist()
    mae = predictor.predict(validation_data.drop(columns=['pv_measurement']))


    return submit_pred


In [15]:

X_test_A = pd.read_parquet('../data/A/X_test_estimated.parquet')
X_test_A = X_test_A.rename(columns={'date_forecast': 'datetime'})
X_test_A = X_test_A.drop(columns=['date_calc'])
X_test_A.set_index("datetime", inplace=True)




X_test_B = pd.read_parquet('../data/B/X_test_estimated.parquet')
X_test_B = X_test_B.rename(columns={'date_forecast': 'datetime'})
X_test_B = X_test_B.drop(columns=['date_calc'])
X_test_B.set_index("datetime", inplace=True)



X_test_C = pd.read_parquet('../data/C/X_test_estimated.parquet')
X_test_C = X_test_C.rename(columns={'date_forecast': 'datetime'})
X_test_C = X_test_C.drop(columns=['date_calc'])
X_test_C.set_index("datetime", inplace=True)
merged_A = combine_data('A')
merged_B = combine_data('B')
merged_C = combine_data('C')

cleaned_A_train, cleaned_A_test = clean_data(merged_A, X_test_A, cat=False)
cleaned_A_train[categorical_features] = cleaned_A_train[categorical_features].astype("category")
cleaned_A_test[categorical_features] = cleaned_A_test[categorical_features].astype("category")


cleaned_B_train, cleaned_B_test = clean_data(merged_B, X_test_B, cat=False)
cleaned_B_train[categorical_features] = cleaned_B_train[categorical_features].astype("category")
cleaned_B_test[categorical_features] = cleaned_B_test[categorical_features].astype("category")

cleaned_C_train, cleaned_C_test = clean_data(merged_C, X_test_C, cat=False)
cleaned_C_train[categorical_features] = cleaned_C_train[categorical_features].astype("category")
cleaned_C_test[categorical_features] = cleaned_C_test[categorical_features].astype("category")

assert not cleaned_A_train.isna().any().any()
assert not cleaned_A_test.isna().any().any()

assert not cleaned_B_train.isna().any().any()
assert not cleaned_B_test.isna().any().any()

assert not cleaned_C_train.isna().any().any()
assert not cleaned_C_test.isna().any().any()


### A

In [16]:

X_A, Val_A = get_validation_data(cleaned_A_train.copy(), seeds[0])
cleaned_A_test_copy = cleaned_A_test.copy()




prediction_A_auto = auto_gl(X_A, cleaned_A_test_copy, Val_A, n=4)



No path specified. Models will be saved in: "AutogluonModels\ag-20231110_184206\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231110_184206\"
AutoGluon Version:  0.8.2
Python Version:     3.10.2
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
Disk Space Avail:   31.39 GB / 999.39 GB (3.1%)
Train Data Rows:    30604
Train Data Columns: 47
Label Column: pv_measurement
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    23516.38 MB
	Train Data (Original)  Memory Usage: 6.24 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGener

                     model  score_test   score_val  pred_time_test  pred_time_val     fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L3 -135.753934 -141.265531       19.807871     267.925849  1523.939124                 0.004001                0.000000           0.281977            3       True         22
1   NeuralNetFastAI_BAG_L2 -136.059168 -144.823581       18.558798     264.875533  1358.496550                 0.357015                0.495014          39.810341            2       True         18
2   RandomForestMSE_BAG_L2 -136.913910 -143.963865       18.531797     265.593567  1335.702886                 0.330014                1.213048          17.016677            2       True         15
3          LightGBM_BAG_L2 -137.131109 -144.347790       18.258786     264.512519  1322.198401                 0.057003                0.132000           3.512192            2       True         14
4     Extr

### B

In [17]:

X_B, Val_B = get_validation_data(cleaned_B_train.copy(), seeds[0])
cleaned_B_test_copy = cleaned_B_test.copy()

prediction_B_auto = auto_gl(X_B, cleaned_B_test_copy, Val_B, n=4)



No path specified. Models will be saved in: "AutogluonModels\ag-20231110_191122\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231110_191122\"
AutoGluon Version:  0.8.2
Python Version:     3.10.2
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
Disk Space Avail:   28.82 GB / 999.39 GB (2.9%)
Train Data Rows:    23740
Train Data Columns: 47
Label Column: pv_measurement
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    21422.86 MB
	Train Data (Original)  Memory Usage: 4.84 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGener

                     model  score_test  score_val  pred_time_test  pred_time_val     fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   NeuralNetFastAI_BAG_L2  -20.598364 -20.891609       16.700499     208.120447  1269.212930                 0.307007                0.384009          29.975584            2       True         18
1      WeightedEnsemble_L3  -20.837830 -20.247386       17.904525     210.490481  1365.112838                 0.003000                0.000999           0.446010            3       True         22
2     ExtraTreesMSE_BAG_L2  -20.888053 -20.448311       16.676497     208.549463  1241.719451                 0.283005                0.813025           2.482104            2       True         17
3      WeightedEnsemble_L2  -21.078226 -20.481766        7.146753      99.471103   281.723789                 0.009001                0.000000           0.511012            2       True         12
4     LightGBML

### C

In [18]:

X_C, Val_C = get_validation_data(cleaned_C_train.copy(), seeds[0])
cleaned_C_test_copy = cleaned_C_test.copy()

prediction_C_auto = auto_gl(X_C, cleaned_C_test_copy, Val_C, n=4)



No path specified. Models will be saved in: "AutogluonModels\ag-20231110_193743\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20231110_193743\"
AutoGluon Version:  0.8.2
Python Version:     3.10.2
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
Disk Space Avail:   25.31 GB / 999.39 GB (2.5%)
Train Data Rows:    18166
Train Data Columns: 47
Label Column: pv_measurement
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    21906.9 MB
	Train Data (Original)  Memory Usage: 3.71 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenera

                     model  score_test  score_val  pred_time_test  pred_time_val     fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   NeuralNetFastAI_BAG_L2  -17.971541 -19.052365       13.509895     144.848181  1120.046041                 0.248003                0.295008          24.529142            2       True         18
1      WeightedEnsemble_L3  -17.984148 -18.659349       14.652917     146.819204  1193.766681                 0.010000                0.000000           0.469007            3       True         22
2     ExtraTreesMSE_BAG_L2  -18.070235 -18.914575       13.467898     145.170184  1097.169929                 0.206005                0.617010           1.653030            2       True         17
3   RandomForestMSE_BAG_L2  -18.131953 -18.997537       13.463895     145.170182  1103.471557                 0.202003                0.617008           7.954659            2       True         15
4        LightG

In [19]:

assert len(prediction_A_auto) == 720 and len(prediction_B_auto) == 720 and len(prediction_C_auto) == 720
from datetime import datetime

t_auto = []
t_auto.extend(prediction_A_auto)
t_auto.extend(prediction_B_auto)
t_auto.extend(prediction_C_auto)


t_auto = [max(i,0) for i in t_auto] # Remove any negative values

df_autogluon = pd.DataFrame({'id': range(720 * 3), 'prediction': [0] * (720 * 3)})
df_autogluon['prediction'] = t_auto


## Combine predictions to submit file
We got the best results using a 90/10 split cat/autogluon, but think a 70/30 is more robust

In [20]:

df = pd.DataFrame()
df['id'] = df_cat['id']
df['prediction'] = (df_cat['prediction'] * 0.7 + df_autogluon['prediction']*0.3) * 0.4 + df_cat['prediction'] * 0.6

df.to_csv(f'submission_{str(datetime.now().strftime("%Y_%m_%d_%H_%M_%S"))}_notebook_2.csv', index=False)
