In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the three raw input tables in one pass: weather, effective capacity and production plan.
cuaca, efcap, production_plan = (
    pd.read_csv(csv_name)
    for csv_name in ('weather_data.csv', 'effective_capacity.csv', 'production_plan.csv')
)

# Modelling for Forecasting Weather

In [None]:
!pip install xgboost==2.0.3



In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error

In [None]:
# Work on an independent deep copy so the raw `cuaca` frame is left untouched.
df_cuaca = cuaca.copy(deep=True)

In [None]:
# Parse 'date' as datetime, then order the rows by mine and by date within each mine.
df_cuaca = (
    df_cuaca
    .assign(date=pd.to_datetime(df_cuaca["date"]))
    .sort_values(["mine_id", "date"])
    .reset_index(drop=True)
)

# Integer-encode the categorical mine identifier; the fitted encoder is kept in `le`.
le = LabelEncoder()
df_cuaca["mine_id_enc"] = le.fit_transform(df_cuaca["mine_id"])

In [None]:
# Feature builder for the sliding-window (lag / rolling) forecasting models
def create_features(df, target, lags=(1, 3, 7), rolls=(3, 7)):
    """Return a copy of ``df`` with calendar, lag and rolling-mean features added.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a datetime ``date`` column, a ``mine_id`` column and the
        ``target`` column. Rows of one mine must appear in chronological
        order, but the mines themselves may be interleaved.
    target : str
        Name of the column the lag / rolling features are derived from.
    lags : iterable of int
        Shifts (in rows, per mine) for the ``{target}_lag{n}`` columns.
    rolls : iterable of int
        Window sizes for the ``{target}_roll{n}`` columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``month``, ``week``, ``dayofyear`` plus one column
        per lag and per rolling window. The input frame is not modified.
    """
    df = df.copy()

    # time-based
    df["month"] = df["date"].dt.month
    df["week"] = df["date"].dt.isocalendar().week.astype(int)
    df["dayofyear"] = df["date"].dt.dayofyear

    # lags per mine
    for lag in lags:
        df[f"{target}_lag{lag}"] = df.groupby("mine_id")[target].shift(lag)

    # rolling averages per mine
    # The shift AND the rolling window are both evaluated inside each mine's
    # group. Rolling over the already-shifted full column would slide over
    # positional neighbours and mix values of different mines whenever a
    # mine's rows are not contiguous (e.g. a row appended at the end).
    # shift(1) keeps the current row's own value out of its window.
    for r in rolls:
        df[f"{target}_roll{r}"] = df.groupby("mine_id")[target].transform(
            lambda s, window=r: s.shift(1).rolling(window).mean()
        )

    return df

In [None]:
# Train one XGBoost regressor for a single weather target
def train_xgb(df, target):
    """Fit an XGBoost regressor for ``target`` and report its hold-out MAE.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; passed to ``create_features`` and must contain a
        ``date`` column and the ``target`` column.
    target : str
        Column to predict.

    Returns
    -------
    tuple
        ``(model, feature_cols, df_feat)``: the fitted ``XGBRegressor``, the
        list of feature column names it was trained on, and the feature frame
        (rows with any missing value removed, original row order kept).
    """
    # make features
    df_feat = create_features(df, target)

    # drop NA after rolling
    df_feat = df_feat.dropna().reset_index(drop=True)

    # train-test split (time based)
    # The split is taken on a date-ordered view: slicing the frame in its
    # incoming order would split by whatever it is sorted on (mine first in
    # this notebook) instead of holding out the most recent 20% of rows.
    # The stable sort keeps the existing order among rows sharing a date.
    ordered = df_feat.sort_values("date", kind="stable")
    cutoff = int(len(ordered) * 0.8)
    train = ordered.iloc[:cutoff]
    test = ordered.iloc[cutoff:]

    # NOTE(review): every remaining column becomes a feature, which includes
    # any other same-day measurement columns present in `df`; confirm those
    # values are available at prediction time.
    excluded = {"date", target, "remark", "weather_id", "mine_id"}
    feature_cols = [col for col in df_feat.columns if col not in excluded]

    X_train, y_train = train[feature_cols], train[target]
    X_test, y_test = test[feature_cols], test[target]

    # early_stopping_rounds is given to the constructor; passing it to fit()
    # is deprecated in the XGBoost version pinned by this notebook.
    model = XGBRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        objective="reg:squarederror",
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="rmse",
        tree_method="hist",
        early_stopping_rounds=20
    )

    # NOTE(review): the hold-out set also drives early stopping, so the MAE
    # printed below is somewhat optimistic.
    model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=False
    )

    preds = model.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    print(f"{target} MAE: {mae:.3f}")

    return model, feature_cols, df_feat

In [None]:
# Fit one XGBoost regressor per weather variable and store the results keyed by target name
targets = ["rainfall_mm", "temperature_c", "humidity_pct", "wind_speed_kmh"]

models, feature_sets, df_feat_store = {}, {}, {}

for t in targets:
    print("\nTraining model for:", t)
    # train_xgb returns (fitted model, feature column names, feature frame)
    models[t], feature_sets[t], df_feat_store[t] = train_xgb(df_cuaca, t)


Training model for: rainfall_mm




rainfall_mm MAE: 0.271

Training model for: temperature_c




temperature_c MAE: 0.441

Training model for: humidity_pct




humidity_pct MAE: 2.005

Training model for: wind_speed_kmh




wind_speed_kmh MAE: 0.150


In [None]:
# Forecast with the already-trained models
def forecast_future(df, models, feature_sets, days_ahead=7):
    """Recursively forecast every target for each mine beyond the last date in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Historical data with ``date`` (datetime), ``mine_id`` and the target
        columns. If it has a ``mine_id_enc`` column, the encoded id of each
        mine is read from it.
    models : dict
        Maps target name -> fitted model exposing ``predict``. The keys
        define which targets are forecast, in dictionary order.
    feature_sets : dict
        Maps target name -> list of feature columns that model expects.
    days_ahead : int
        Number of consecutive days to forecast after ``df["date"].max()``.

    Returns
    -------
    pandas.DataFrame
        One row per (day, mine) with ``date``, ``mine_id``, ``mine_id_enc``,
        ``month``, ``week``, ``dayofyear`` and one column per target.
    """
    target_names = list(models)
    last_date = df["date"].max()
    mines = df["mine_id"].unique()

    # Encoded id per mine. Reading it from the frame avoids relying on the
    # notebook-level `le`, which later cells rebind to other encoders.
    if "mine_id_enc" in df.columns:
        mine_codes = (
            df.drop_duplicates("mine_id")
              .set_index("mine_id")["mine_id_enc"]
              .to_dict()
        )
    else:
        mine_codes = dict(zip(mines, le.transform(mines)))

    # One chronological history per mine. Features are built on a single
    # mine at a time, so lag / rolling windows never see another mine's rows.
    histories = {
        mine: df[df["mine_id"] == mine]
        .sort_values("date", kind="stable")
        .reset_index(drop=True)
        for mine in mines
    }

    future_rows = []

    for i in range(days_ahead):
        next_date = last_date + pd.Timedelta(days=i+1)

        for mine in mines:
            row = {
                "date": next_date,
                "mine_id": mine,
                "mine_id_enc": mine_codes[mine],
                "month": next_date.month,
                "week": next_date.isocalendar().week,
                "dayofyear": next_date.timetuple().tm_yday
            }

            # Append an empty row for the day to predict; its target values
            # are NaN until the models below fill them in.
            candidate = pd.DataFrame([{
                "date": next_date,
                "mine_id": mine,
                "mine_id_enc": mine_codes[mine],
            }])
            temp_df = pd.concat([histories[mine], candidate], ignore_index=True)

            for target in target_names:
                feats = create_features(temp_df, target)

                # take the last row (the day being forecast) as model input
                # NOTE(review): same-day values of the other targets are
                # still NaN in this row at prediction time.
                pred_input = feats.iloc[-1:][feature_sets[target]]

                # predict
                row[target] = models[target].predict(pred_input)[0]

            # Feed the predictions back into this mine's history so the lag
            # and rolling features of the following days are based on them;
            # otherwise every horizon would reuse the day-1 lags.
            last_idx = temp_df.index[-1]
            for target in target_names:
                temp_df.loc[last_idx, target] = row[target]
            histories[mine] = temp_df

            future_rows.append(row)

    return pd.DataFrame(future_rows)

In [None]:
# Forecast the 7 days following the last date in the dataset for every mine, then preview the first rows
df_pred_weather = forecast_future(
    df=df_cuaca,
    models=models,
    feature_sets=feature_sets,
    days_ahead=7,
)
df_pred_weather.head(5)

Unnamed: 0,date,mine_id,mine_id_enc,month,week,dayofyear,rainfall_mm,temperature_c,humidity_pct,wind_speed_kmh
0,2025-11-08,MINE_1,0,11,45,312,0.505132,25.413834,68.557098,0.214318
1,2025-11-08,MINE_2,1,11,45,312,0.411516,25.164579,68.466515,0.266288
2,2025-11-08,MINE_3,2,11,45,312,0.668217,25.932009,64.65519,0.239608
3,2025-11-08,MINE_4,3,11,45,312,0.604666,25.563963,68.584282,0.297618
4,2025-11-08,MINE_5,4,11,45,312,0.545919,25.719606,68.178108,0.309793


## Saving the XGBoost Models for Forecasting Weather Features

In [None]:
# Base directory used by the cells below when writing model and encoder files.
# NOTE(review): 'path' looks like a placeholder — set it to a real directory before running.
saving_path = 'path'

In [None]:
import os
import pickle
import json  # used to write the target -> file path mapping

# All forecasting models are written into one sub-directory of saving_path.
output_dir = f'{saving_path}/Hasil Model'
os.makedirs(output_dir, exist_ok=True)

# Pickle each regressor and record the file it was written to.
model_paths = {}
for target in models:
    model = models[target]
    model_filename = f'{output_dir}/forecast_weather_xgboost_{target}.pkl'
    model_paths[target] = model_filename
    with open(model_filename, 'wb') as f:
        pickle.dump(model, f)

# Write the target -> path mapping as JSON next to the models.
with open(f'{output_dir}/model_paths.json', 'w') as f:
    json.dump(model_paths, f, indent=4)

print(f"All models and their paths have been saved to '{output_dir}' directory.")
print(f"Model paths mapping saved to '{output_dir}/model_paths.json'")

All models and their paths have been saved to '/content/drive/MyDrive/Asah-Capstone Project!/Dataset Thingy/Modelling/Hasil Model' directory.
Model paths mapping saved to '/content/drive/MyDrive/Asah-Capstone Project!/Dataset Thingy/Modelling/Hasil Model/model_paths.json'


# Weather Classification by Features Using XGBoost Classifier

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import pickle

In [None]:
# Load the weather table used for the classification task.
# NOTE(review): this reads the same file that was loaded into `cuaca` at the top of the notebook.
df = pd.read_csv("weather_data.csv")

In [None]:
# Features: every column except the label ('remark') and the two identifier columns
X = df.drop(columns=["remark", "weather_id", "mine_id"])
y = df["remark"]

# Encode the target labels as integers
le_remark = LabelEncoder()
y_encoded = le_remark.fit_transform(y)

# Encode every object/category feature column as integers, keeping one fitted encoder per column
X_encoded = X.copy()
encoders = {}

for col in X_encoded.select_dtypes(include=['object', 'category']).columns:
    # A dedicated name is used instead of `le`: `le` holds the mine-id encoder
    # that forecast_future reads as a global, so rebinding it here would break
    # any later re-run of the forecast.
    col_encoder = LabelEncoder()
    X_encoded[col] = col_encoder.fit_transform(X_encoded[col])
    encoders[col] = col_encoder

In [None]:
# Stratified 80:20 train/test split (class proportions preserved in both parts)
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_encoded,
    stratify=y_encoded,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Training an XGBoost classifier with hyperparameter tuning via GridSearchCV
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# The number of classes is not passed explicitly: the scikit-learn wrapper
# derives it from the labels seen in fit(). The previous `num_classes` keyword
# is not an XGBoost parameter (the native name is `num_class`) and was ignored.
xgb = XGBClassifier(
    objective='multi:softmax',
    eval_metric='mlogloss',
    random_state=42
)

# 2 x 2 x 2 x 2 x 2 = 32 candidate settings
param_grid = {
    'max_depth': [4, 6],
    'learning_rate': [0.05, 0.1],
    'n_estimators': [150, 250],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

# 5-fold cross-validated search scored on accuracy, using all available cores
grid = GridSearchCV(
    xgb,
    param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV accuracy:", grid.best_score_)


|   iter    |  target   |   depth   | learni... | l2_lea... | baggin... |
-------------------------------------------------------------------------
| [39m1        [39m | [39m1.0      [39m | [39m5.6217808[39m | [39m0.1906357[39m | [39m7.5879454[39m | [39m5.9865848[39m |
| [39m2        [39m | [39m1.0      [39m | [39m4.0921304[39m | [39m0.0396389[39m | [39m1.5227525[39m | [39m8.6617614[39m |
| [39m3        [39m | [39m1.0      [39m | [39m7.2078050[39m | [39m0.1445337[39m | [39m1.1852604[39m | [39m9.6990985[39m |
| [39m4        [39m | [39m1.0      [39m | [39m8.8270984[39m | [39m0.0503444[39m | [39m2.6364247[39m | [39m1.8340450[39m |
| [39m5        [39m | [39m1.0      [39m | [39m5.1296957[39m | [39m0.1097037[39m | [39m4.8875051[39m | [39m2.9122914[39m |
| [39m6        [39m | [39m1.0      [39m | [39m9.9497178[39m | [39m0.1399998[39m | [39m9.8658597[39m | [39m0.0546332[39m |
| [39m7        [39m | [39m1.0      [39m | [

KeyboardInterrupt: 

In [None]:
# Score the tuned classifier on the held-out test set
y_pred = grid.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
print(confusion_matrix(y_test, y_pred))
# Class names come from the fitted target encoder
print(classification_report(y_test, y_pred, target_names=le_remark.classes_))

## Saving Weather Classification Model

In [None]:
# Save the tuned XGBoost model together with the feature and target encoders.
# Each file is opened in a `with` block so it is flushed and closed right after
# writing, instead of leaving the handle open until garbage collection.
artifacts = {
    "xgb_weather_classification.pkl": grid.best_estimator_,
    "label_encoder_target.pkl": le_remark,
    "feature_encoders.pkl": encoders,
}

for filename, obj in artifacts.items():
    with open(filename, "wb") as f:
        pickle.dump(obj, f)

print("dah selesai")

# Effective Capacity Prediction Model

In [None]:
# Independent working copy of the effective-capacity table; the bare name below displays it
df_efcap = efcap.copy(deep=True)

df_efcap

Unnamed: 0,effcap_id,plan_id,mine_id,equipment_id,equipment_type,week_start,road_condition,weather_condition,availability_pct,effective_capacity_ton_day,remark
0,EFC00000-1,PLAN0001-1,MINE_1,EQ037-1,Excavator,2019-01-01,Fair,Mendung,73,29.46,Optimal
1,EFC00001-1,PLAN0002-1,MINE_1,EQ037-1,Excavator,2019-01-08,Good,Mendung,96,44.30,Optimal
2,EFC00002-1,PLAN0003-1,MINE_1,EQ033-1,Loader,2019-01-15,Good,Mendung,93,35.15,Optimal
3,EFC00003-1,PLAN0004-1,MINE_1,EQ007-1,Dump Truck,2019-01-22,Fair,Mendung,93,1531.46,Optimal
4,EFC00004-1,PLAN0005-1,MINE_1,EQ017-1,Loader,2019-02-01,Good,Mendung,87,41.20,Optimal
...,...,...,...,...,...,...,...,...,...,...,...
1195,EFC01195-1,PLAN0336-1,MINE_1,EQ027-1,Excavator,2025-12-22,Good,Mendung,98,399.23,Optimal
1196,EFC01196-4,PLAN0192-4,MINE_4,EQ026-4,Excavator,2025-12-22,Good,Mendung,87,218.14,Optimal
1197,EFC01197-2,PLAN0288-2,MINE_2,EQ049-2,Loader,2025-12-22,Good,Mendung,71,26.07,Optimal
1198,EFC01198-3,PLAN0240-3,MINE_3,EQ039-3,Excavator,2025-12-22,Fair,Mendung,86,50.64,Optimal


In [None]:
# Drop the columns that are not used for modelling.
# The result is derived from the raw `efcap` frame rather than dropped in place,
# so re-running this cell gives the same frame instead of raising a KeyError
# for columns that were already removed.
unused_columns = ['effcap_id', 'equipment_id', 'remark', 'plan_id', 'week_start']
df_efcap = efcap.drop(columns=unused_columns)
df_efcap

Unnamed: 0,mine_id,equipment_type,road_condition,weather_condition,availability_pct,effective_capacity_ton_day
0,MINE_1,Excavator,Fair,Mendung,73,29.46
1,MINE_1,Excavator,Good,Mendung,96,44.30
2,MINE_1,Loader,Good,Mendung,93,35.15
3,MINE_1,Dump Truck,Fair,Mendung,93,1531.46
4,MINE_1,Loader,Good,Mendung,87,41.20
...,...,...,...,...,...,...
1195,MINE_1,Excavator,Good,Mendung,98,399.23
1196,MINE_4,Excavator,Good,Mendung,87,218.14
1197,MINE_2,Loader,Good,Mendung,71,26.07
1198,MINE_3,Excavator,Fair,Mendung,86,50.64


In [None]:
# Encode the categorical (object-dtype) features as integers with one LabelEncoder per column
from sklearn.preprocessing import LabelEncoder
import joblib

cat_columns = df_efcap.select_dtypes(include=['object']).columns

encoders = {}  # dictionary to store encoders per column

# NOTE(review): the columns are overwritten in place, so running this cell a
# second time finds no object columns and leaves `encoders` empty.
for col in cat_columns:
    # A dedicated name is used instead of `le`: `le` holds the mine-id encoder
    # that forecast_future reads as a global, so it must not be rebound here.
    col_encoder = LabelEncoder()
    df_efcap[col] = col_encoder.fit_transform(df_efcap[col])
    encoders[col] = col_encoder

['/content/drive/MyDrive/Asah-Capstone Project!/Dataset Thingy/Modelling/encoders_efcap.pkl']

In [None]:
encoders

{'mine_id': LabelEncoder(),
 'equipment_type': LabelEncoder(),
 'road_condition': LabelEncoder(),
 'weather_condition': LabelEncoder()}

In [None]:
# 80:20 train/test split for the effective-capacity regression
from sklearn.model_selection import train_test_split

target_col = 'effective_capacity_ton_day'
X = df_efcap.drop(columns=[target_col])
y = df_efcap[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=88, test_size=0.2
)

In [None]:
# Training a Random Forest regressor for effective capacity estimation
from sklearn.ensemble import RandomForestRegressor

# A fixed seed makes the fitted forest, and the metrics computed from it,
# reproducible across kernel restarts.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Predict effective capacity for the held-out test rows with the fitted model
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Mean squared error and coefficient of determination on the test set
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f'MSE: {mse}')
print(f'R2: {r2}')

MSE: 32341.20111021135
R2: 0.8930227876653893


## Saving Effective Capacity Model

In [None]:
# Save the dictionary of per-column label encoders fitted on the effective-capacity features.
# `joblib` is imported in the encoding cell above.
joblib.dump(encoders, f"{saving_path}/encoders_efcap.pkl")

In [None]:
import pickle
import os

# Save the trained model to a .pkl file.
# `model` is the Random Forest regressor fitted in the training cell above.
# The name previously used here, `best_rf_model`, is not defined anywhere in
# this notebook and raised a NameError on a fresh kernel.
# NOTE(review): the file name suggests a tuned model; no tuning step is present
# in this notebook, so the untuned regressor is what gets saved — confirm.
with open(f'{saving_path}/tuning_effcap_ranfor_regression.pkl', 'wb') as file:
    pickle.dump(model, file)

# Production Plan Model

In [None]:
# Inner-join the production plan with the effective-capacity table on the shared plan_id
df = production_plan.merge(efcap, how='inner', on='plan_id')

In [None]:
# Selecting features used for modelling
features = ['road_condition', 'weather_condition', 'availability_pct', 'effective_capacity_ton_day', 'planned_output_ton']
# Take an explicit copy: X is modified afterwards (label encoding), and
# assigning into a bare selection of `df` triggers pandas' SettingWithCopyWarning.
X = df[features].copy()
y = df['actual_output_ton']

In [None]:
#import label encoder
from sklearn.preprocessing import LabelEncoder

In [None]:
# Integer-encode road and weather condition, keeping one fitted encoder for each column
le_road, le_weather = LabelEncoder(), LabelEncoder()

X['road_condition'] = le_road.fit_transform(X['road_condition'])
X['weather_condition'] = le_weather.fit_transform(X['weather_condition'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['road_condition'] = le_road.fit_transform(X['road_condition'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['weather_condition'] = le_weather.fit_transform(X['weather_condition'])


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Categorical and numerical feature columns
cat_cols = ['road_condition', 'weather_condition']
num_cols = ['availability_pct','effective_capacity_ton_day','planned_output_ton']

# Train/test split: 20% of the rows are held out for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2)

# Preprocessor: one-hot encode the categorical columns and standardise the numerical ones
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
numeric_step = ('num', StandardScaler(), num_cols)
preprocessor = ColumnTransformer([categorical_step, numeric_step])

In [None]:
# Training and evaluating the production plan model (random forest regressor inside a preprocessing pipeline)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

model = RandomForestRegressor(n_estimators=100, random_state=42)

# The regressor is fitted once, through the pipeline. A separate
# model.fit(X_train, y_train) beforehand is redundant: pipeline.fit refits
# this same estimator object on the preprocessed features.
pipeline = Pipeline([
    ('prep', preprocessor),
    ('model', model)
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print("R2 Score:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

R2 Score: 0.8208232320105721
RMSE: 15204.044979428378


## Saving Production Plan Model

In [None]:
# Bundle the road-condition and weather-condition label encoders (in that order) and save them.
# The target directory '<saving_path>/Hasil Model' is created by the cell that saves the forecasting models.
# NOTE(review): only the encoders are saved here; the fitted `pipeline` is not written anywhere in this notebook.
encoders_prod_plan = [le_road, le_weather]

joblib.dump(encoders_prod_plan, f"{saving_path}/Hasil Model/encoders_prodplan.pkl")

['/content/drive/MyDrive/Asah-Capstone Project!/Dataset Thingy/Modelling/Hasil Model/encoders_prodplan.pkl']