In [51]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score



In [None]:
data_raw = pd.read_csv('raw.csv', header= None)
data = data_raw[0].str.split(expand=True) # splitting the single string into multiple columns
print(data.describe(include= 'all'))
data = data.apply(pd.to_numeric, errors='coerce') #Doing this since LightGBM can't work on object datatype



                 0            1            2            3            4   \
count         24000        24000        24000        24000        24000   
unique         7209        24000        24000        24000        24000   
top     1.322200000  1.729207126  3.245963193  3.157587544  2.342418377   
freq             15            1            1            1            1   

                 5            6            7            8            9   \
count         24000        24000        24000        24000        24000   
unique        24000        24000        24000        24000        24000   
top     3.795994630  4.230365499  3.396589019  4.533288081  2.168227396   
freq              1            1            1            1            1   

                 10           11           12           13           14  \
count         24000        24000        24000        24000        24000   
unique        24000        24000        24000        24000        24000   
top     2.691689379  2.

In [71]:
test_raw = pd.read_csv('test.csv' , header = None)
test = test_raw[0].str.split(expand=True)
print(test.describe(include= 'all'))
test = test.apply(pd.to_numeric, errors='coerce')



                 0            1            2            3            4   \
count           360          360          360          360          360   
unique          348          360          360          360          360   
top     1.333220000  2.137468917  4.424625276  1.550003628  1.169750378   
freq              2            1            1            1            1   

                 5            6            7            8            9   \
count           360          360          360          360          360   
unique          360          360          360          360          360   
top     1.218868548  1.053177773  3.718149624  2.401240180  3.152576470   
freq              1            1            1            1            1   

                 10           11           12           13           14  \
count           360          360          360          360          360   
unique          360          360          360          360          360   
top     3.132008550  1.

In [54]:
correlation_matrix = data.corr()
print(correlation_matrix.describe())


              0          1          2          3          4          5   \
count  41.000000  41.000000  41.000000  41.000000  41.000000  41.000000   
mean    0.151044  -0.105615   0.025347   0.025021   0.023396   0.022577   
std     0.201654   0.205551   0.157666   0.157861   0.157791   0.158011   
min    -0.759605  -0.759605  -0.093509  -0.102983  -0.094113  -0.099956   
25%     0.098331  -0.132817  -0.002136  -0.003302  -0.004330  -0.005586   
50%     0.170200  -0.120930   0.000275   0.000928  -0.001727  -0.000521   
75%     0.185946  -0.099956   0.005781   0.004219   0.003343   0.002502   
max     1.000000   1.000000   1.000000   1.000000   1.000000   1.000000   

              6          7          8          9          10         11  \
count  41.000000  41.000000  41.000000  41.000000  41.000000  41.000000   
mean    0.025376   0.022231   0.022230   0.025246   0.025247   0.027032   
std     0.157537   0.157278   0.157858   0.160302   0.159853   0.161072   
min    -0.086067  -0.058

# Looking at the means, I think column 0 and 1 are the targets, since they have the highest means, also column 0 only has about 7200 unique datas

In [72]:
y_train = data.iloc[:, 0:2]  # Assuming first two columns are targets
X_train = data.iloc[:, 2:]  
y_test = test.iloc[:, 0:2]
X_test = test.iloc[:, 2:]


# Random Forest Regressor Model

In [61]:
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)


In [62]:
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"R² Score: {r2}")

Mean Absolute Error: 0.040477095354902876
R² Score: 0.49372987991412254


# Tried to tune hyperparameters for random forest but somehow got worse results than just the default model, wasted 30 mins on this, ignore this cell. Leaving the output untouched just to show attempt.  Also had to use GPT to learn how to do this..


In [66]:
import optuna
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Define the objective function
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 5, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])

    # Train the Random Forest model
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        n_jobs= -1
    )

    # Cross-validation score
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    return scores.mean()  # Return the mean R² score

# Create a study object and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Best hyperparameters and score
print("Best Parameters:", study.best_params)
print("Best R² Score:", study.best_value)


[I 2025-01-27 17:41:08,109] A new study created in memory with name: no-name-d6593b3d-9209-4e43-a20d-d0ca3ecb933f
[I 2025-01-27 17:41:19,283] Trial 0 finished with value: 0.42444331937567803 and parameters: {'n_estimators': 275, 'max_depth': 39, 'min_samples_split': 19, 'min_samples_leaf': 6, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.42444331937567803.
[I 2025-01-27 17:41:26,017] Trial 1 finished with value: 0.44292466114467555 and parameters: {'n_estimators': 143, 'max_depth': 45, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.44292466114467555.
[I 2025-01-27 17:41:40,210] Trial 2 finished with value: 0.3499151836482689 and parameters: {'n_estimators': 522, 'max_depth': 9, 'min_samples_split': 11, 'min_samples_leaf': 8, 'max_features': 'log2'}. Best is trial 1 with value: 0.44292466114467555.
[I 2025-01-27 17:41:51,992] Trial 3 finished with value: 0.4393389207368806 and parameters: {'n_estimators': 273, 'max_depth': 

Best Parameters: {'n_estimators': 920, 'max_depth': 27, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
Best R² Score: 0.4576560173341602


# LightGBM Model

In [74]:
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor

model = LGBMRegressor(random_state=42)
model = MultiOutputRegressor(model)  #LightGBM can only work with one target by default, have to wrap it with MultiOutputRegressor

model.fit(X_train, y_train)

predictions = model.predict(X_test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001514 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9945
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 39
[LightGBM] [Info] Start training from score 1.325139
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001218 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9945
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 39
[LightGBM] [Info] Start training from score 1.883559


In [76]:
r2_target1 = r2_score(y_test.iloc[:, 0], predictions[:, 0])  # Infinite Multiplication Factor
r2_target2 = r2_score(y_test.iloc[:, 1], predictions[:, 1])  # PPPF

print(f"R² Score for Infinite Multiplication Factor: {r2_target1}")
print(f"R² Score for PPPF: {r2_target2}")


R² Score for Infinite Multiplication Factor: 0.828330478930373
R² Score for PPPF: 0.6970344349224671


# LGBM gives the best results.