In [1]:
# This cell loads the integrated dataset, fits a random forest on it, and prints the train/test error.

import os
import sys

import pandas as pd

# Put the project root on sys.path BEFORE the project-local imports below.
# If the path is only inserted afterwards, those imports resolve only when the
# kernel happens to be started from a directory that already contains them.
# (Presumably data_selector, feature_selector, logs and models live under
# project_root -- confirm against the repository layout.)
project_root = os.path.abspath("U:/ML_project/bargh/")
sys.path.insert(0, project_root)

from data_selector import Data_selector  # noqa: E402
from feature_selector import Feature_selector  # noqa: E402
from logs.logger import CustomLogger  # noqa: E402
from models import Random_Forest  # noqa: E402

# Logger for this notebook, configured with the log file name 'model_main.log'.
logger = CustomLogger(name="model_main", log_file_name='model_main.log').get_logger()

if __name__ == "__main__":
    # Tunables first: forest hyperparameters and the columns that must not be
    # used as features. Both `feature_to_be_dropped` and `model` are reused by
    # the next cell, so their names are part of this cell's contract.
    n_est, depth = 100, 37
    feature_to_be_dropped = ['id', 'hour', 'date', 'status', 'declare']

    # Load the integrated dataset and let Data_selector pick the working frame.
    df = pd.read_csv("U:/ML_project/bargh/data/processed/integrated.csv", encoding='utf-8')
    data_selector = Data_selector(df)
    df_modified = data_selector.select()

    # Split into features X and target y ("generation" is presumably the
    # target column -- it is what the next cell separates out as y_all).
    feature_selector = Feature_selector(df_modified, "generation")
    X, y = feature_selector.select(feature_to_be_dropped)

    # Scale/split, then fit the forest with the hyperparameters chosen above.
    model = Random_Forest()
    model.scale_and_split_data(X, y)
    model.fit(n_estimators=n_est, max_depth=depth)

    # Report the errors on the train and test partitions.
    mse_train_actual, mse_test_actual = model.compute_mse_error()
    print(
        f"Train Error: {mse_train_actual}%",
        f"Test Error: {mse_test_actual}%",
        sep="\n",
    )


Train Error: 1.1106081134413648%
Test Error: 2.8997745725003723%


In [2]:
    # This cell adds the fitted model's prediction as a new `prediction` column
    # of the integrated dataset and writes the result to with_prediction.csv.
    # It depends on `model` and `feature_to_be_dropped` from the training cell,
    # so that cell must have been run first.

    # Read the file once: one copy is transformed into model input, the
    # untouched frame receives the predictions.
    integrated_raw = pd.read_csv("U:/ML_project/bargh/data/processed/integrated.csv", encoding='utf-8')

    df2 = integrated_raw.copy()
    # NOTE(review): 'status' is listed in feature_to_be_dropped, so this rewrite
    # has no effect on X_all; kept to mirror the existing preprocessing.
    df2.loc[~df2['status'].isin(['SO', 'LF1']), 'status'] = 'SO'
    # Every row ends up with value == 'P'. If 'value' is an object/category
    # column, get_dummies(drop_first=True) below emits no column for it.
    df2.loc[~df2['value'].isin(['P']), 'value'] = 'P'
    # `columns=` alone selects the axis; no `axis=` or in-place mutation needed.
    df2 = df2.drop(columns=feature_to_be_dropped)

    # One-hot encode whatever non-numeric columns remain.
    categorical_cols = df2.select_dtypes(include=['object', 'category']).columns
    df2 = pd.get_dummies(df2, columns=categorical_cols, drop_first=True)

    # NOTE(review): confirm that X_all has the same columns (and scaling) the
    # model was fitted on -- model.pred is opaque from this notebook.
    X_all = df2.drop(columns=["generation"])
    y_all = df2["generation"]  # not used below; kept available for later cells

    y_pred = model.pred(X_all)

    # Attach the predictions to the unmodified data and persist it.
    df3 = integrated_raw.assign(prediction=y_pred)
    df3.to_csv('U:/ML_project/bargh/data/processed/with_prediction.csv', index=False)