In [1]:
import sys
from pathlib import Path

PROJECT_DIR = Path('..').resolve()
DATA_DIR = PROJECT_DIR / 'data/power-plant'

In [2]:
import pandas as pd

dataset = pd.read_csv(DATA_DIR / 'power_plants.csv')
dataset

Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.40,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.50,1009.23,96.62,473.90
...,...,...,...,...,...
47835,15.12,48.92,1011.80,72.93,462.59
47836,33.41,77.95,1010.30,59.72,432.90
47837,15.99,43.34,1014.20,78.66,465.96
47838,17.65,59.87,1018.58,94.65,450.93


In [3]:
import numpy as np
dataset['Device_Id'] = np.random.randint(0, 3, dataset.shape[0])

In [4]:
dataset.head()

Unnamed: 0,AT,V,AP,RH,PE,Device_Id
0,14.96,41.76,1024.07,73.17,463.26,2
1,25.18,62.96,1020.04,59.08,444.37,1
2,5.11,39.4,1012.16,92.14,488.56,0
3,20.86,57.32,1010.24,76.64,446.48,1
4,10.82,37.5,1009.23,96.62,473.9,1


In [19]:
mlflow.set_tracking_uri('http://127.0.0.1:5000')

In [21]:
mlflow.set_experiment("Power Plant mlflow")

INFO: 'Power Plant mlflow' does not exist. Creating a new experiment


In [94]:
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score

def train_fun(Dataset: pd.DataFrame) -> dict:
    device = dataset["Device_Id"].iloc[0]
    
    with mlflow.start_run(run_name= f"{device}", nested=True):
        X = Dataset[["AT","V","AP","RH"]]
        y = Dataset["PE"]
        X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
        model = RandomForestRegressor()
        model.fit(X_train, y_train)
        # m = mlflow.sklearn.log_model(model)
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r_2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)

        mlflow.log_metrics({'rmse': mse, 'mae': mae, 'r2': r_2})
        print(f'RMSE = {mse:.2f}, MAE = {mae:.2f}, R2 = {r_2:.2f}')
    return mse, r_2, mae

In [95]:
train_fun(dataset)

2
RMSE = 0.08, MAE = 0.14, R2 = 1.00


(0.07905680137542151, 0.9997281184948631, 0.13910934364553004)

In [91]:
with mlflow.start_run(run_name="testing the model"):
    train_fun(dataset[dataset.Device_Id == 1])

RMSE = 3.47, MAE = 1.13, R2 = 0.99


In [92]:
with mlflow.start_run(run_name="Groupby device name"):
    dataset.groupby("Device_Id").apply(train_fun)

RMSE = 3.02, MAE = 1.06, R2 = 0.99
RMSE = 3.60, MAE = 1.14, R2 = 0.99
RMSE = 3.79, MAE = 1.12, R2 = 0.99


In [93]:
with mlflow.start_run(run_name="MLflow test"):
    for i in range(3):
        result = train_fun(dataset[dataset["Device_Id"] == i])
        print(result)

RMSE = 3.15, MAE = 1.07, R2 = 0.99
(3.1486588165990614, 0.9891730802956553, 1.0706565366614584)
RMSE = 3.55, MAE = 1.15, R2 = 0.99
(3.549176980381937, 0.9879843358868837, 1.1533560921717307)
RMSE = 3.80, MAE = 1.12, R2 = 0.99
(3.795351743177971, 0.9870804749766342, 1.122086205817946)


# Dividing the dataset using distrubution and random sampling 

In [55]:
dataset_lower_distrubution = dataset[dataset["PE"] < 450]

In [57]:
index = np.random.rand(len(dataset_lower_distrubution))
index

array([0.22056988, 0.45601625, 0.45976602, ..., 0.43841309, 0.3033041 ,
       0.45296419])

In [58]:
mask = index < 0.5
mask

array([ True,  True,  True, ...,  True,  True,  True])

In [59]:
dataset_march = dataset_lower_distrubution[mask]
dataset_april = dataset_lower_distrubution[~mask]

In [67]:
print("Oraginal dataset",dataset.shape, " -------  Split for April",dataset_april.shape," ---------- Split for March", dataset_march.shape)

Oraginal dataset (47840, 6)  -------  Split for April (11374, 6)  ---------- Split for March (11401, 6)


In [68]:
dataset_higher_distrubution = dataset[dataset["PE"] >= 450]
high_mask = np.random.rand(len(dataset_higher_distrubution)) < 0.5

In [None]:
dataset_may = dataset_higher_distrubution[high_mask]
dataset_june = data

In [96]:
def mapping(month_df: pd.DataFrame, month: str):
    with mlflow.start_run(run_name=month):
        dataset_march.apply(train_fun)