In [1]:
import os

import optuna
import pandas as pd
from lightgbm import LGBMRegressor, Dataset
from optuna.samplers import TPESampler
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

ModuleNotFoundError: No module named 'lightgbm'

In [None]:
# Project root that contains the "Dataset" folder. The historical location is
# kept as the default so existing setups behave exactly as before; set the
# BROKO_DATA_ROOT environment variable to run the notebook on another machine.
# `path` deliberately stays a plain str: later cells build file names with
# `path + "/Dataset/..."`.
path = os.environ.get("BROKO_DATA_ROOT", "/home/pydev/Music/work_files/latest_broko_code")

# Load the cleaned dataset and show per-column dtypes and non-null counts.
df = pd.read_csv(path + "/Dataset/ML_CLEAN_DATA__Bdv2.2_RES.csv")
df.info(verbose=True, show_counts=True)

In [None]:
# Columns removed before modelling (NOTE(review): presumably identifiers and
# raw-price fields that would leak the target; confirm with the data owner).
drop_cols = [
    "ML_Number",
    "Postal_Code",
    "Sold_Price",
    "Month_Year",
    "HPI_for_Month",
]
df = df.drop(columns=drop_cols)

# Missing-value count per remaining column.
df.isna().sum()

In [None]:
# Keep only fully populated rows (dropna defaults: row-wise, drop on any NaN).
data = df.dropna()

In [None]:
# Outlier removal on the target with the 1.5 * IQR rule.
hpi_price = data['HPI_Sold_Price']
q1, q3 = hpi_price.quantile([0.25, 0.75])
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Rows falling outside the bounds, kept around for inspection.
lower_bound_outliers = data[hpi_price < lower_bound]
upper_bound_outliers = data[hpi_price > upper_bound]
# Optional export of the rejected rows:
# lower_bound_outliers.to_csv(path + '/Dataset/lower_bound_outliers.csv')
# upper_bound_outliers.to_csv(path +'/Dataset/upper_bound_outliers.csv')

# Modelling set: rows whose target lies within [lower_bound, upper_bound].
dataset = data[hpi_price.between(lower_bound, upper_bound)]

In [None]:
# Sanity check: per-column missing-value count after filtering.
dataset.isna().sum()

In [None]:
# Column dtypes and non-null counts of the filtered modelling set.
dataset.info(show_counts=True, verbose=True)

In [None]:
# Separate the regression target from the feature matrix.
y = dataset['HPI_Sold_Price']
X = dataset.drop(columns='HPI_Sold_Price')

In [None]:
# Names of the feature columns stored with object dtype; these are the
# columns label-encoded and passed to LightGBM as categorical.
categorical_features = [
    name for name, kind in zip(X.columns, X.dtypes) if kind == object
]


In [None]:
# Optuna objective: fit one LightGBM regressor per trial and score it
def objective(trial):
    """Train a LightGBM regressor with trial-sampled hyperparameters.

    Reads the notebook-level globals ``X``, ``y`` and ``categorical_features``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample ``max_depth``, ``learning_rate``,
        ``n_estimators``, ``reg_lambda`` and ``reg_alpha``.

    Returns
    -------
    float
        Mean absolute percentage error on the held-out 20% validation split
        (lower is better).
    """
    # Imported here because the notebook's import cell only pulls in
    # LGBMRegressor/Dataset from lightgbm.
    from lightgbm import early_stopping, log_evaluation

    params = {
        "max_depth": trial.suggest_int("max_depth", 5, 30),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5),
        "n_estimators": trial.suggest_int("n_estimators", 5, 50),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 1),
    }

    # Fixed random_state: every trial is evaluated on the same split.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    # Work on explicit copies so the column assignments below neither touch
    # the global X nor trigger pandas' SettingWithCopyWarning.
    X_train = X_train.copy()
    X_valid = X_valid.copy()

    # Encode categorical features before training.
    # One encoder per column, fitted once on the full column so that a given
    # category maps to the SAME integer in both splits. Re-fitting on the
    # validation split (as before) assigns codes independently and silently
    # misaligns the categories between train and valid.
    for feature in categorical_features:
        le = LabelEncoder().fit(X[feature])
        X_train[feature] = le.transform(X_train[feature])
        X_valid[feature] = le.transform(X_valid[feature])

    # verbose=-1 silences LightGBM's own info/warning output.
    model = LGBMRegressor(**params, verbose=-1)

    # Early stopping and log suppression go through callbacks; `fit` has no
    # `early_stopping=` / `verbose=` keyword arguments.
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        categorical_feature=categorical_features,
        callbacks=[
            early_stopping(stopping_rounds=10, verbose=False),
            log_evaluation(period=0),
        ],
    )

    y_pred = model.predict(X_valid)

    return MAPE(y_valid, y_pred)


In [None]:
# Hyperparameter search with a seeded TPE sampler; the study minimises the
# value returned by `objective`.
sampler = TPESampler(seed=123)
study = optuna.create_study(sampler=sampler, direction="minimize")

# Run 100 trials with a progress bar.
study.optimize(objective, show_progress_bar=True, n_trials=100)

# Hyperparameters of the best (lowest-scoring) trial.
best_params = study.best_params

# Report them.
print("Best hyperparameters: ", best_params)

In [None]:
# Persist the best hyperparameters as a one-row CSV in the Dataset folder.
best_df = pd.DataFrame([best_params])
best_df.to_csv(f"{path}/Dataset/best_lgb.csv", index=False)