# Setup

In [1]:
!git clone https://github.com/FloydMayweatherMachineLearningLab/runtime_prediction_assignment.git

Cloning into 'runtime_prediction_assignment'...
remote: Enumerating objects: 18, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 18 (delta 3), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (18/18), 4.04 MiB | 5.07 MiB/s, done.
Resolving deltas: 100% (3/3), done.


In [None]:
# If needed, change to your path in google drive
PATH_TO_FILES = "/content/runtime_prediction_assignment"

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np
from typing import List

# EDA

In [None]:
## Convert seconds to bucket function, ~ do not edit ~
def seconds2bucket(s: float) -> int:
    """Map a runtime given in seconds to its duration-bucket label.

    Upper bounds are exclusive:
    0 = under 1 h, 1 = 1-2 h, 2 = 2-4 h, 3 = 4-8 h, 4 = 8-12 h,
    5 = 12 h or longer.
    """
    if s < 1*60*60:
        return 0
    elif s < 2*60*60:
        return 1
    elif s < 4*60*60:
        return 2
    elif s < 8*60*60:
        return 3
    elif s < 12*60*60:
        return 4
    return 5

## Score function, ~ do not edit ~
def score(y_true: List[int], y_pred: List[int]):
    """Return the weighted-average F1 score of predicted vs. true bucket labels.

    Thin wrapper around ``f1_score(..., average='weighted')``.
    """
    return f1_score(y_true, y_pred, average='weighted')

In [None]:
# Get training data
df_train = pd.read_csv(PATH_TO_FILES+"/train_data.csv")

In [None]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,seconds,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10
0,40729,17482.052,f4142,1684225174927,d9df6,573f2,090a9,0,4,1,98f91,Dr Timothy Whittaker,da39a 516b9 5bab6 5a36f 14ddc 52b73 356a1 64e0...
1,15034,64.141,24559,1684220255700,23908,6d0bd,090a9,1,2,1,98f91,Steven West-Williams,6d0bd 6f700 5fd0d bfb78 f5370 32035 58d1b 1f75...
2,5553,1.301,4b920,1684220862466,dab03,42352,3ae78,1,8,1,98f91,Dr Peter Marsh,42352 c7eb6 c099a 63357 f7e66 fbd72 a2a8e dc72...
3,33420,3907.559,f7c05,1684222604220,b3bd8,d86ca,3635b,0,2,1,98f91,Anne Smith,d86ca fb773 8ab30 d061d a94a8 eb3ed 6495a 671c...
4,5054,35.489,3d27a,1684223766232,a6f73,c7fd9,09e19,1,2,1,98f91,Rachael Bond,da39a c7eb6 c099a b928c 642d7 c9e42 ce05c c0cd...


In [None]:
df_train.describe()

Unnamed: 0.1,Unnamed: 0,seconds,feature_1,feature_5,feature_6,feature_7
count,37500.0,37500.0,37500.0,37500.0,37500.0,37500.0
mean,19913.890347,2192.293933,1684222000000.0,0.873067,10.266827,1.132987
std,11478.216562,6406.286202,1874714.0,0.332903,18.304888,0.852049
min,14.0,0.0,1684219000000.0,0.0,1.0,1.0
25%,9088.5,13.9295,1684221000000.0,1.0,2.0,1.0
50%,21274.5,424.0925,1684222000000.0,1.0,4.0,1.0
75%,29185.0,1484.339,1684224000000.0,1.0,16.0,1.0
max,43067.0,85053.607,1684226000000.0,1.0,273.0,16.0


In [None]:
len(df_train['Unnamed: 0'].unique())

29323

In [None]:
#removing	"Unnamed: 0" which seems to be a running index with a few "jumps" in values and repetitions, it also does not exist in the test set
df_train.drop(columns=['Unnamed: 0'], axis=1, inplace=True)

In [None]:
# missing values count
df_train.isnull().sum()

seconds       0
feature_0     0
feature_1     0
feature_2     0
feature_3     0
feature_4     0
feature_5     0
feature_6     0
feature_7     0
feature_8     0
feature_9     0
feature_10    0
dtype: int64

In [None]:
df_train.dtypes

seconds       float64
feature_0      object
feature_1       int64
feature_2      object
feature_3      object
feature_4      object
feature_5       int64
feature_6       int64
feature_7       int64
feature_8      object
feature_9      object
feature_10     object
dtype: object

In [None]:
# Example for a solution - a naive linear regression model followed by bucket classification

# Baseline: use only the columns that are already numeric in the raw frame.
numeric_features = ["feature_1", "feature_5", "feature_6", "feature_7"]
X = df_train[numeric_features]
y = df_train['seconds']

# Fixed random_state so the 80/20 split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# Regress on the continuous runtime in seconds ...
reg_model = LinearRegression()
reg_model.fit(X_train, y_train)

# ... then bucket both predictions and ground truth so they can be scored as classes.
y_pred = reg_model.predict(X_val)
bucket_pred = [seconds2bucket(pred) for pred in y_pred]
bucket_label = [seconds2bucket(label) for label in y_val]
print(f"Validation set score: {score(bucket_label, bucket_pred)}") #Weighted F1-score

Validation set score: 0.890369337979094


In [None]:
def macro_score(y_true: List[int], y_pred: List[int]):
    """Return the macro-averaged F1 score of predicted vs. true bucket labels.

    Delegates to ``f1_score`` with ``average='macro'``.
    """
    averaging = 'macro'
    macro_f1 = f1_score(y_true, y_pred, average=averaging)
    return macro_f1

print(f"Validation set macro-f1 score: {macro_score(bucket_label, bucket_pred)}") #Macro F1-score

Validation set macro-f1 score: 0.23403019744483158


In [None]:
# names which are in feature 9 in the train set and not in the test set (list was received when trying to predict and these values' one hot encoded columns were missing)
df_train.feature_9.value_counts()[df_train.feature_9.value_counts().index.isin(['Dr Brian Rahman', 'Dr Douglas Miah', 'Elliot Ryan', 'Emily Dixon', 'Hazel Jenkins', 'Jay Atkinson', 'Marcus Brown-Cook', 'Miss Aimee Daly', 'Ms Chelsea Russell'])]

Marcus Brown-Cook     10
Emily Dixon            8
Dr Brian Rahman        8
Jay Atkinson           5
Hazel Jenkins          2
Miss Aimee Daly        2
Ms Chelsea Russell     1
Dr Douglas Miah        1
Elliot Ryan            1
Name: feature_9, dtype: int64

In [None]:
# Since this is a classification problem, lets add the classes and use it as a target value instead of seconds
df_train['bucket_label'] = [seconds2bucket(label) for label in df_train['seconds']]
df_train.drop(columns=['seconds'], inplace=True)
df_train.columns

Index(['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4',
       'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9',
       'feature_10', 'bucket_label'],
      dtype='object')

In [None]:
df_train.shape

(37500, 12)

In [None]:
# Dominant labels
df_train['bucket_label'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

0    87.3%
1     5.8%
2     3.5%
3     2.3%
5     0.7%
4     0.5%
Name: bucket_label, dtype: object

**Initial Notes**

Problem Definition: Multi-Class Classification - predicting the runtime bucket (in hours) of each job.


Classes (as defined by `seconds2bucket`): 0 [<1 h], 1 [1–2 h], 2 [2–4 h], 3 [4–8 h], 4 [8–12 h], 5 [≥12 h]

The example of linear regression is quite good by f1-weighted, however the macro-f1 score (avg. F1 scores of all classes) is 23.4%.

Since in business terms (an assumption) correctly predicting a longer running time is more crucial than predicting a shorter running time, assuming:
* cost is usually related to the use of computational power
* long run-time acts as a bottleneck that prevents usage of resources which otherwise could be used for other tasks/accelerating business processes,

and adding to this the fact that the data is highly skewed towards one class,

**therefore macro-F1 will be used instead of weighted-f1, as it is a better quality representation of the models' performance.**

In addition, some columns hold hexadecimal values.

---







# Preprocessing

* Usually the process of creating a preprocessing pipeline should be accompanied by a domain expert. In addition, there is no usage of distribution based values such as mean or median, so the process can be applied on the entire train and validation set.

In [None]:
# conversion of Hex columns to Dec
hex_columns = ["feature_0", "feature_2", "feature_3", "feature_4", "feature_8"] #without feature_10 which has very large values
df_hex_to_dec = df_train.copy()

# Define a function to convert hexadecimal to decimal
def hex_to_decimal(hex_str):
  """Parse a hexadecimal string (e.g. ``'f4142'``) and return its integer value."""
  hex_base = 16
  decimal_value = int(hex_str, hex_base)
  return decimal_value

# Apply the conversion function to the 'hex_column' and create a new column 'decimal_column'
def convert_hex_cols_to_dec(df, hex_columns):
  """Return a copy of ``df`` with a ``<col>_dec`` column added for each hex column.

  Every value of each column listed in ``hex_columns`` is passed through
  ``hex_to_decimal``; the source columns are kept and ``df`` is not modified.
  """
  converted = df.copy()
  for source_col in hex_columns:
    converted[source_col + '_dec'] = converted[source_col].apply(hex_to_decimal)
  return converted

df_hex_to_dec = convert_hex_cols_to_dec(df_hex_to_dec, hex_columns)

df_hex_to_dec.columns

Index(['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4',
       'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9',
       'feature_10', 'bucket_label', 'feature_0_dec', 'feature_2_dec',
       'feature_3_dec', 'feature_4_dec', 'feature_8_dec'],
      dtype='object')

In [None]:
df_hex_to_dec.head(n=3)

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,bucket_label,feature_0_dec,feature_2_dec,feature_3_dec,feature_4_dec,feature_8_dec
0,f4142,1684225174927,d9df6,573f2,090a9,0,4,1,98f91,Dr Timothy Whittaker,da39a 516b9 5bab6 5a36f 14ddc 52b73 356a1 64e0...,3,999746,892406,357362,37033,626577
1,24559,1684220255700,23908,6d0bd,090a9,1,2,1,98f91,Steven West-Williams,6d0bd 6f700 5fd0d bfb78 f5370 32035 58d1b 1f75...,0,148825,145672,446653,37033,626577
2,4b920,1684220862466,dab03,42352,3ae78,1,8,1,98f91,Dr Peter Marsh,42352 c7eb6 c099a 63357 f7e66 fbd72 a2a8e dc72...,0,309536,895747,271186,241272,626577


In [None]:
#removing original columns
df_hex_to_dec = df_hex_to_dec[['feature_1', 'feature_5', 'feature_6', 'feature_7',
       'feature_9', 'feature_10', 'bucket_label', 'feature_0_dec',
       'feature_2_dec', 'feature_3_dec', 'feature_4_dec', 'feature_8_dec']]

In [None]:
df_hex_to_dec.head()

Unnamed: 0,feature_1,feature_5,feature_6,feature_7,feature_9,feature_10,bucket_label,feature_0_dec,feature_2_dec,feature_3_dec,feature_4_dec,feature_8_dec
0,1684225174927,0,4,1,Dr Timothy Whittaker,da39a 516b9 5bab6 5a36f 14ddc 52b73 356a1 64e0...,3,999746,892406,357362,37033,626577
1,1684220255700,1,2,1,Steven West-Williams,6d0bd 6f700 5fd0d bfb78 f5370 32035 58d1b 1f75...,0,148825,145672,446653,37033,626577
2,1684220862466,1,8,1,Dr Peter Marsh,42352 c7eb6 c099a 63357 f7e66 fbd72 a2a8e dc72...,0,309536,895747,271186,241272,626577
3,1684222604220,0,2,1,Anne Smith,d86ca fb773 8ab30 d061d a94a8 eb3ed 6495a 671c...,1,1014789,736216,886474,222043,626577
4,1684223766232,1,2,1,Rachael Bond,da39a c7eb6 c099a b928c 642d7 c9e42 ce05c c0cd...,0,250490,683891,819161,40473,626577


In [None]:
# Split feature_10 on whitespace into one column per token; rows with fewer tokens are padded with '0'
feature_10_splited = df_hex_to_dec['feature_10'].str.split(expand=True).add_prefix('feature_10_').fillna('0')

# Convert every token column from hex to decimal, then keep only the new *_dec columns
feature_10_splited_converted_to_dec = convert_hex_cols_to_dec(feature_10_splited, feature_10_splited.columns)
feature_10_splited_converted_to_dec.drop(columns=feature_10_splited.columns, axis=1, inplace=True)

# Collect the index of the 38 rows whose feature_9 name does not appear in the test set (the rows are dropped further below)
index = df_hex_to_dec[df_hex_to_dec.feature_9.isin(['Dr Brian Rahman', 'Dr Douglas Miah', 'Elliot Ryan', 'Emily Dixon', 'Hazel Jenkins', 'Jay Atkinson', 'Marcus Brown-Cook', 'Miss Aimee Daly', 'Ms Chelsea Russell'])].index

# One Hot Encoding feature_9
df_with_feature_9_one_hot_encoded = pd.get_dummies(df_hex_to_dec, columns=['feature_9'])

# Concatenate the decoded feature_10 columns with the one-hot encoded frame, remove the raw feature_10 text column, and drop the rows collected above
df_manual_preprocessing = pd.concat([df_with_feature_9_one_hot_encoded, feature_10_splited_converted_to_dec], axis=1)
df_manual_preprocessing.drop(columns=['feature_10'], axis=1, inplace=True)
df_manual_preprocessing.drop(index, axis=0, inplace=True)

# All columns are now whole numbers, converting to int
df_manual_preprocessing=df_manual_preprocessing.astype(int)

In [None]:
feature_10_splited_converted_to_dec

Unnamed: 0,feature_10_0_dec,feature_10_1_dec,feature_10_2_dec,feature_10_3_dec,feature_10_4_dec,feature_10_5_dec,feature_10_6_dec,feature_10_7_dec,feature_10_8_dec,feature_10_9_dec,...,feature_10_1077_dec,feature_10_1078_dec,feature_10_1079_dec,feature_10_1080_dec,feature_10_1081_dec,feature_10_1082_dec,feature_10_1083_dec,feature_10_1084_dec,feature_10_1085_dec,feature_10_1086_dec
0,893850,333497,375478,369519,85468,338803,218785,413193,894137,632277,...,0,0,0,0,0,0,0,0,0,0
1,446653,456448,392461,785272,1004400,204853,363803,128858,884245,525507,...,0,0,0,0,0,0,0,0,0,0
2,271186,818870,788890,406359,1015398,1031538,666254,902948,716001,374202,...,0,0,0,0,0,0,0,0,0,0
3,886474,1030003,568112,853533,693416,963565,411994,422341,768712,625125,...,0,0,0,0,0,0,0,0,0,0
4,893850,818870,788890,758412,410327,826946,843868,789723,722256,753717,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37495,893850,818870,788890,652722,661449,531652,784696,403141,291504,125521,...,0,0,0,0,0,0,0,0,0,0
37496,595034,303265,818870,792802,758412,615424,615424,125914,582951,86107,...,0,0,0,0,0,0,0,0,0,0
37497,327634,590522,989885,818870,792802,406359,504868,927566,374748,767446,...,0,0,0,0,0,0,0,0,0,0
37498,271186,818870,788890,406359,1015398,1031538,666254,902948,716001,374202,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df_manual_preprocessing.head()

Unnamed: 0,feature_1,feature_5,feature_6,feature_7,bucket_label,feature_0_dec,feature_2_dec,feature_3_dec,feature_4_dec,feature_8_dec,...,feature_10_1077_dec,feature_10_1078_dec,feature_10_1079_dec,feature_10_1080_dec,feature_10_1081_dec,feature_10_1082_dec,feature_10_1083_dec,feature_10_1084_dec,feature_10_1085_dec,feature_10_1086_dec
0,1684225174927,0,4,1,3,999746,892406,357362,37033,626577,...,0,0,0,0,0,0,0,0,0,0
1,1684220255700,1,2,1,0,148825,145672,446653,37033,626577,...,0,0,0,0,0,0,0,0,0,0
2,1684220862466,1,8,1,0,309536,895747,271186,241272,626577,...,0,0,0,0,0,0,0,0,0,0
3,1684222604220,0,2,1,1,1014789,736216,886474,222043,626577,...,0,0,0,0,0,0,0,0,0,0
4,1684223766232,1,2,1,0,250490,683891,819161,40473,626577,...,0,0,0,0,0,0,0,0,0,0


# AutoML using AutoGluon

Since the task is described as "not tricky and ... it is a straight forward task" and the data is hashed, I'm moving directly to finding the optimal solution.

It is important to note that AutoGluon automatically performs feature selection and creates a validation set.

### Setup

In [None]:
!pip install autogluon

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor

### AutoML run

In [None]:
train_data = TabularDataset(df_manual_preprocessing) # Conversion to AutoGluon's data structure
predictor = TabularPredictor(label='bucket_label', eval_metric='f1_macro', problem_type ='multiclass').fit(train_data=train_data, time_limit=500, holdout_frac=0.2) #f1-macro

No path specified. Models will be saved in: "AutogluonModels/ag-20230726_140040/"
Beginning AutoGluon training ... Time limit = 500s
AutoGluon will save models to "AutogluonModels/ag-20230726_140040/"
AutoGluon Version:  0.8.2
Python Version:     3.10.6
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Fri Jun 9 10:57:30 UTC 2023
Disk Space Avail:   80.45 GB / 115.66 GB (69.6%)
Train Data Rows:    37462
Train Data Columns: 1318
Label Column: bucket_label
Preprocessing data ...
Train Data Class Count: 6
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    5516.03 MB
	Train Data (Original)  Memory Usage: 395.0 MB (7.2% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 250 features to boolean dtype a

**Best model:** #1 WeightedEnsemble_L2 | #2 LightGBM | #3 LightGBMXT  

**Best validation macro-f1 score:** 0.8706	  | 0.8705	  | 0.8362

**Weighted Ensemble** [article](https://www.cs.cornell.edu/~alexn/papers/shotgun.icml04.revised.rev2.pdf)

"[The model uses] Support Vector Machines
(SVMs), artificial neural nets (ANNs), memory-based
learning (KNN), decision trees (DT), bagged decision
trees (BAG-DT), boosted decision trees (BST-DT),
and boosted stumps (BST-STMP). For each algorithm
we train models using many different parameter set-
tings...
We train about 2000 models for each problem. Some
models have excellent performance, equal to or better
than the best models reported in the literature. Other
models, however, have mediocre or even poor perfor-
mance. Rather than combine good and bad models in
an ensemble, we use forward stepwise selection from
the library of models to find a subset of models that
when averaged together yield excellent performance."

* L2 refers to the hierarchy level of the model, with L1 being a base model and L2 a dependent model (it does not refer to L2 regularization).

[AutoGluon model documentation](https://auto.gluon.ai/stable/api/autogluon.tabular.models.html).


**WeightedEnsemble_L2 and LightGBM take the lead.**

# Choosing a model

Although WeightedEnsemble_L2 has the highest score with LightGBM closely behind, WeightedEnsemble_L2 is an implementation in AutoGluon with minimal documentation and even smaller community support.

Such things are **imperative** for medium-long term support of models.

Since this exercise simulates the creation of a model for production, LightGBM is the chosen model for this task.



---



### Testing without AutoGluon's feature selection

In [None]:
import lightgbm as lgb

# Work on a copy so df_manual_preprocessing keeps its label column
X = df_manual_preprocessing.copy()

y = X['bucket_label']
X.drop(columns=['bucket_label'], inplace=True)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0) # 0.2 is larger than AutoGluon's 0.066

# building the lightgbm model
LightGBM_model = lgb.LGBMClassifier(objective='multiclass', metric='multi_logloss', num_class=6)

#training on the train set
LightGBM_model.fit(X_train, y_train)

#K-fold cross validation would have been better, but for a fair comparison against AutoGluon which does not perform it, a simple score is calculated
y_pred = LightGBM_model.predict(X_val)
print('#############################################################################')
print(f"Validation set LightGBM model without Auto Feature-Selection macro-f1 score: {macro_score(y_val, y_pred)}") #Macro F1-score
print(f"Validation set LightGBM model without Auto Feature-Selection weighted-f1 score: {score(y_val, y_pred)}") #Weighted F1-score
print('#############################################################################')

# AutoGluon's LightGBM with 830 Unused Columns
# Trained on the same X_train/y_train split and scored on the same X_val, so the two results are comparable.
# Note: `predictor` and `y_pred` are reassigned here and replace the values from the cells above.
preprocessed_df_train = TabularDataset(pd.concat([X_train, y_train], axis=1))
predictor = TabularPredictor(label='bucket_label', eval_metric='f1_macro', problem_type ='multiclass').fit(train_data=preprocessed_df_train, time_limit=500, hyperparameters={'GBM':{}}) #f1-macro
y_pred = predictor.predict(X_val, model='LightGBM')
print(f"Validation set AutoGluon's model and feature selection macro-f1 score: {macro_score(y_val, y_pred)}") #Macro F1-score
print(f"Validation set AutoGluon's model and feature selection weighted-f1 score: {score(y_val, y_pred)}") #Weighted F1-score

#############################################################################
Validation set LightGBM model without Auto Feature-Selection macro-f1 score: 0.8256279846972685
Validation set LightGBM model without Auto Feature-Selection weighted-f1 score: 0.9802055621463653
#############################################################################


No path specified. Models will be saved in: "AutogluonModels/ag-20230726_153229/"
Beginning AutoGluon training ... Time limit = 500s
AutoGluon will save models to "AutogluonModels/ag-20230726_153229/"
AutoGluon Version:  0.8.2
Python Version:     3.10.6
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Fri Jun 9 10:57:30 UTC 2023
Disk Space Avail:   79.73 GB / 115.66 GB (68.9%)
Train Data Rows:    29969
Train Data Columns: 1318
Label Column: bucket_label
Preprocessing data ...
Train Data Class Count: 6
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    5956.75 MB
	Train Data (Original)  Memory Usage: 315.99 MB (5.3% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 241 features to boolean dtype 

Validation set AutoGluon's model and feature selection macro-f1 score: 0.8527184513838372
Validation set AutoGluon's model and feature selection weighted-f1 score: 0.9826262829851502


* Results are significantly higher: either some leakage occurs (despite training only on X_train), or the model managed to capture from the train set a distribution that is closer to that of the validation set.



---

# Prediction on the test set


## Pipeline

In [None]:
###pipeline###

## Convert seconds to bucket function, ~ do not edit ~
def seconds2bucket(s: float) -> int:
    """Map a runtime given in seconds to its duration-bucket label.

    Upper bounds are exclusive:
    0 = under 1 h, 1 = 1-2 h, 2 = 2-4 h, 3 = 4-8 h, 4 = 8-12 h,
    5 = 12 h or longer.
    """
    if s < 1*60*60:
        return 0
    elif s < 2*60*60:
        return 1
    elif s < 4*60*60:
        return 2
    elif s < 8*60*60:
        return 3
    elif s < 12*60*60:
        return 4
    return 5

# Define a function to convert hexadecimal to decimal
def hex_to_decimal(hex_str):
    """Convert a hexadecimal token to an integer without ever raising.

    Resolution order:
      1. parse ``hex_str`` as a base-16 string;
      2. otherwise convert it with plain ``int()`` (covers values that are
         already numeric, for which ``int(value, 16)`` raises ``TypeError``);
      3. otherwise print a notice and return 0.

    Args:
        hex_str: Usually a hexadecimal string such as ``'f4142'``; numeric or
            missing values are tolerated.

    Returns:
        The decoded integer, or 0 when the value cannot be converted.
    """
    try:
        return int(hex_str, 16)
    except (TypeError, ValueError):
        # TypeError: non-string input (int() rejects an explicit base for it).
        # ValueError: a string that is not valid hexadecimal.
        pass

    try:
        return int(hex_str)
    except (TypeError, ValueError, OverflowError):
        # e.g. None, NaN, +/-inf or arbitrary text
        print(hex_str, ' Cannot be converted to INT, returning 0')
        return 0


# For every column in `hex_columns`, add a decoded '<column>_dec' column
def convert_hex_cols_to_dec(df, hex_columns):
  """Return a copy of ``df`` with a ``<col>_dec`` column per entry of ``hex_columns``.

  Each new column holds ``hex_to_decimal`` applied to every value of the
  source column. The source columns are kept and ``df`` is not modified.
  """
  df_temp = df.copy()
  for col in hex_columns:
    new_decimal_col_name = col + '_dec'
    df_temp[new_decimal_col_name] = df_temp[col].apply(hex_to_decimal)
  return df_temp

def preprocessing(df, type):
  """Turn a raw train/test frame into the all-integer feature matrix.

  Steps: hex-decode the single-token hash columns, expand ``feature_10`` into
  one hex-decoded column per whitespace-separated token, one-hot encode
  ``feature_9`` and cast everything to ``int64``.

  Args:
    df: Raw frame as read from the CSV. Must contain ``feature_0`` ..
      ``feature_10``; when ``type == 'train'`` it must also contain ``seconds``.
    type: ``'train'`` to additionally build ``bucket_label`` and to remove the
      rows and one-hot columns of ``feature_9`` names that do not appear in the
      test set. Any other value is handled as test data. (The name shadows the
      builtin ``type``; it is kept so existing callers keep working.)

  Returns:
    A new ``int64`` DataFrame. ``df`` itself is left untouched.
  """
  is_train = type == 'train'

  # feature_9 names that exist in the training data but not in the test data
  train_only_names = ['Dr Brian Rahman', 'Dr Douglas Miah', 'Elliot Ryan', 'Emily Dixon', 'Hazel Jenkins',
                      'Jay Atkinson', 'Marcus Brown-Cook', 'Miss Aimee Daly', 'Ms Chelsea Russell']

  # conversion of Hex columns to Dec (the helper already works on a copy)
  hex_columns = ["feature_0", "feature_2", "feature_3", "feature_4", "feature_8"] #without feature_10 which has very large values
  df_hex_to_dec = convert_hex_cols_to_dec(df, hex_columns)

  if is_train:
    # bucketing labels -- taken from the frame that was passed in, not from a
    # notebook-level global, so the function works on any training frame
    df_hex_to_dec['bucket_label'] = [seconds2bucket(label) for label in df['seconds']]
    kept_columns = ['feature_1', 'feature_5', 'feature_6', 'feature_7',
        'feature_9', 'feature_10', 'bucket_label', 'feature_0_dec',
        'feature_2_dec', 'feature_3_dec', 'feature_4_dec', 'feature_8_dec']
  else:
    kept_columns = ['feature_1', 'feature_5', 'feature_6', 'feature_7',
        'feature_9', 'feature_10', 'feature_0_dec', 'feature_2_dec', 'feature_3_dec', 'feature_4_dec', 'feature_8_dec']

  # Keeping only these columns also removes the raw hex columns, the running
  # index ('Unnamed: 0') and the continuous label ('seconds') when present.
  df_hex_to_dec = df_hex_to_dec[kept_columns]

  #splitting feature 10 into one column per token; short rows are padded with '0'
  feature_10_splited = df_hex_to_dec['feature_10'].str.split(expand=True).add_prefix('feature_10_')
  feature_10_splited = feature_10_splited.fillna('0')
  #converting columns from HEX to DEC and keeping only the decoded columns
  feature_10_splited_converted_to_dec = convert_hex_cols_to_dec(feature_10_splited, feature_10_splited.columns)
  feature_10_splited_converted_to_dec = feature_10_splited_converted_to_dec.drop(columns=feature_10_splited.columns)

  # One Hot Encoding feature_9
  df_with_feature_9_one_hot_encoded = pd.get_dummies(df_hex_to_dec, columns=['feature_9'])

  #Concatenate the split_values DataFrame with the original DataFrame and removal of original columns
  df_manual_preprocessing = pd.concat([df_with_feature_9_one_hot_encoded, feature_10_splited_converted_to_dec], axis=1)
  df_manual_preprocessing = df_manual_preprocessing.drop(columns=['feature_10'])

  if is_train:
    #removing the rows with names which do not appear in the test set.
    # The result of drop() must be kept: without inplace/assignment it is a no-op.
    rows_to_remove = df_hex_to_dec[df_hex_to_dec['feature_9'].isin(train_only_names)].index
    df_manual_preprocessing = df_manual_preprocessing.drop(index=rows_to_remove)
    # ...and their (now all-zero) one-hot columns; errors='ignore' tolerates a
    # training frame in which some of these names never occur.
    df_manual_preprocessing = df_manual_preprocessing.drop(
        columns=['feature_9_' + name for name in train_only_names], errors='ignore')

  # NOTE(review): the one-hot and feature_10 columns are derived from the frame
  # being processed, so train and test outputs can differ in columns -- confirm
  # alignment before predicting.
  #assert list(df_manual_preprocessing.columns) == training_df_columns
  df_manual_preprocessing = df_manual_preprocessing.astype('int64')
  return df_manual_preprocessing

import lightgbm as lgb

def pipeline(dataframe):
  """Fit an AutoGluon predictor restricted to LightGBM on a preprocessed frame.

  Args:
    dataframe: Already-preprocessed training data that contains the
      ``bucket_label`` target column.

  Returns:
    The fitted ``TabularPredictor`` (macro-F1 metric, multiclass problem,
    500 s time limit).
  """
  # adding the preprocessing stage here demands too much ram
  # build and train model on the frame that was passed in (not on a global)
  preprocessed_df_train = TabularDataset(dataframe)
  predictor = TabularPredictor(label='bucket_label', eval_metric='f1_macro', problem_type ='multiclass').fit(train_data=preprocessed_df_train, time_limit=500, hyperparameters={'GBM':{}}) #f1-macro

  return predictor

In [None]:
# Since the pipeline should include all necessary actions, we'll use the original dataframe and make sure that it runs well from start to finish
from autogluon.tabular import TabularDataset, TabularPredictor

df_train = pd.read_csv(PATH_TO_FILES+"/train_data.csv")

df_train = preprocessing(df_train, 'train')
predictor = pipeline(df_train)

Train Data Class Count: 6
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    11202.14 MB
	Train Data (Original)  Memory Usage: 392.7 MB (3.5% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 250 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Unused Original Features (Count: 388): ['feature_9_Allan Jones', 'feature_9_Andrea Murphy', 'feature_9_Annette Jackson', 'feature_9_Antony Adams', 'feature_9_Antony Cox-Baker', 'feature_9_Ashleigh Hunter', 'feature_9_Barbara Thomas

### Creating the CSV output

In [None]:
# Get test data and run it through the same preprocessing function in 'test' mode (no label column is built)
# NOTE(review): preprocessing() derives the one-hot feature_9 columns and the number of feature_10 columns
# from the frame it is given -- confirm the resulting columns match those the predictor was trained on.
df_test = pd.read_csv(PATH_TO_FILES+"/test_data.csv")
preprocessed_df_test = preprocessing(df_test, 'test')

# predict with the LightGBM model explicitly rather than the predictor's default model
y_pred = predictor.predict(preprocessed_df_test, model='LightGBM')

# Save predictions
pd.Series(y_pred).to_csv(PATH_TO_FILES+"/y_preds.csv")

**Final Notes**

*  It is possible to further improve the model's results using fine-tuning, at the risk of overfitting.
* It is possible to continue the use of regression models followed by bucketing, as done in the example, but given LightGBM's high results, I hope this result is sufficient for an exercise.
* It seems there is a bug which causes AutoGluon to train a WeightedEnsemble even if only LightGBM was specified; I overcame this by specifying which model to use when predicting.
* I'd be more than happy to explain anything which may raise questions.

Looking forward to hearing from you,
Roei Zaady
