## We will try training different models and compare their performance using experiment tracking in MLflow. Only the code of the best-performing model is included here; the log files and a video comparison of all the models (showing the MLflow UI) are in the GitHub repo.

In [1]:

import os
import shutil

import joblib
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, cross_validate, KFold, GridSearchCV
from sklearn.preprocessing import RobustScaler

# Read the raw table and discard the 'hsi_id' column, which is used neither as
# a feature nor as the target below.
file_path = "TASK2.csv"
df = pd.read_csv(file_path).drop(columns=['hsi_id'])

# 'vomitoxin_ppb' is the regression target; every other remaining column is
# treated as a feature.
y = df['vomitoxin_ppb']
X = df.drop(columns=['vomitoxin_ppb'])

# Outlier Detection and Removal
def remove_outliers_iqr(df, columns, threshold=1.5):
    """Drop rows whose value in any listed column lies outside the IQR fences.

    Columns are processed one after another, and each column's quartiles are
    computed from the rows that survived the columns before it, so the result
    can depend on the order of ``columns``.

    Args:
        df: DataFrame to filter; it is not modified.
        columns: Iterable of column labels to check.
        threshold: Multiplier applied to the IQR to place the lower and upper
            fences around Q1 and Q3.

    Returns:
        The rows of ``df`` that stay within the fences for every listed
        column, keeping their original index labels.
    """
    kept = df
    for name in columns:
        q_low, q_high = kept[name].quantile(0.25), kept[name].quantile(0.75)
        margin = threshold * (q_high - q_low)
        # Both fences are inclusive, matching a >= / <= comparison.
        inside = kept[name].between(q_low - margin, q_high + margin)
        kept = kept[inside]
    return kept

# Filter the feature rows on every feature column, then pick the targets with
# the surviving index labels so X and y stay aligned row for row.
X_no_outliers = remove_outliers_iqr(X, X.columns)
y_no_outliers = y.loc[X_no_outliers.index]

# Log-transform the target variable (log1p, i.e. log(1 + y))
y_transformed = np.log1p(y_no_outliers)

# Scale the spectral data; the fitted scaler is kept so it can be saved later
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X_no_outliers)

# Feature Selection using Random Forest
# Fit a temporary forest on all scaled features and rank the columns by its
# feature_importances_ values, highest first.
rf_temp = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_scaled, y_transformed)
feature_importances = rf_temp.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X_no_outliers.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Keep the names of the 50 top-ranked features.
selected_features = feature_importance_df.head(50)['Feature'].tolist()

# X_scaled is indexed by position, so translate the chosen names back into
# their column positions in X_no_outliers.
selected_positions = [X_no_outliers.columns.get_loc(name) for name in selected_features]
X_selected = X_scaled[:, selected_positions]

# Hyperparameter Tuning
# Shuffled 5-fold splitter with a fixed seed, shared by the grid search and
# the later cross-validation of the chosen model.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Candidate values per hyperparameter (3 * 4 * 3 * 3 = 108 combinations).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search over the grid, ranked by negative MSE on the
# log-transformed target, using all available cores.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    scoring='neg_mean_squared_error',
    cv=kf,
    n_jobs=-1,
)
grid_search.fit(X_selected, y_transformed)

best_rf_model = grid_search.best_estimator_

# Cross-Validation with Best Model
# Evaluate all three metrics in a single pass: each fold's model is fitted
# once and scored three times, instead of being refitted separately for every
# metric. The splitter and the forest both use fixed seeds, so the per-fold
# scores match those of three independent cross_val_score calls.
# NOTE(review): the scaler and the feature selection above were fitted on all
# rows before these folds are drawn, so the scores may be optimistic.
cv_results = cross_validate(
    best_rf_model,
    X_selected,
    y_transformed,
    cv=kf,
    scoring={
        'mae': 'neg_mean_absolute_error',
        'mse': 'neg_mean_squared_error',
        'r2': 'r2',
    },
)
scores_mae = cv_results['test_mae']
scores_mse = cv_results['test_mse']
scores_r2 = cv_results['test_r2']

# The error scorers return negated values (higher is better), so flip the sign
# to report ordinary MAE / MSE. All metrics are on the log1p-transformed target.
mean_mae = -scores_mae.mean()
mean_mse = -scores_mse.mean()
mean_r2 = scores_r2.mean()

# Start MLflow Experiment
with mlflow.start_run(run_name="Random_Forest_Outlier_FS_Tune_Log_Transform"):
    # Record the hyperparameters chosen by the grid search so this run can be
    # compared with other runs by configuration, not only by score.
    mlflow.log_params(grid_search.best_params_)

    # Cross-validated metrics, computed on the log1p-transformed target.
    mlflow.log_metric("CV_MAE", mean_mae)
    mlflow.log_metric("CV_MSE", mean_mse)
    mlflow.log_metric("CV_R2", mean_r2)

    # Store the tuned model as an artifact of this run.
    mlflow.sklearn.log_model(best_rf_model, "Random_Forest_Outlier_FS_Tune_Log_Transform")

# Save Model and Preprocessing Artifacts
# The scaler and the selected feature names are saved next to the model so
# that new data can be prepared the same way before prediction.
joblib.dump(scaler, "scaler.pkl")
joblib.dump(selected_features, "selected_features.pkl")

# mlflow.sklearn.save_model raises if the target directory already exists and
# is not empty, which would make this cell fail on every re-run. Remove the
# output of a previous run first; only the directory written here is touched.
model_dir = "rf_model"
if os.path.isdir(model_dir):
    shutil.rmtree(model_dir)
mlflow.sklearn.save_model(best_rf_model, model_dir)

print("Model, Scaler, and Selected Features saved successfully!")
print(f"CV MAE: {mean_mae}")
print(f"CV MSE: {mean_mse}")
print(f"CV R2 Score: {mean_r2}")




Model, Scaler, and Selected Features saved successfully!
CV MAE: 1.9350276179633414
CV MSE: 6.545835174345885
CV R2 Score: 0.22038883967208775


In [2]:
# List the contents of the "mlruns" directory, resolved relative to the
# current working directory.
ls mlruns


 Volume in drive C has no label.
 Volume Serial Number is B294-F0F6

 Directory of C:\Users\tanishq\mlruns

03/10/2025  09:48 PM    <DIR>          .
03/10/2025  09:48 PM    <DIR>          ..
03/10/2025  09:43 PM    <DIR>          .trash
03/11/2025  02:01 AM    <DIR>          0
03/10/2025  09:48 PM    <DIR>          models
               0 File(s)              0 bytes
               5 Dir(s)  38,051,876,864 bytes free


In [3]:
import subprocess
import sys

# Start the MLflow UI as a background process using the kernel's own Python
# interpreter. The argument list is passed without shell=True: combined with
# shell=True on POSIX, only the first list item ("mlflow") would be run as the
# command and "ui" would be handed to the shell instead of to mlflow.
# The handle is kept so the server can be stopped later with
# mlflow_ui_process.terminate().
mlflow_ui_process = subprocess.Popen([sys.executable, "-m", "mlflow", "ui"])
mlflow_ui_process



<Popen: returncode: None args: ['mlflow', 'ui']>

In [None]:
# Serve the MLflow UI on 127.0.0.1:5000 through a shell command.
# NOTE(review): this presumably occupies the cell until the server is
# interrupted, and it may clash with the UI already started through
# subprocess in the previous cell — confirm which launch is intended.
!python -m mlflow ui --port 5000 --host 127.0.0.1


# Observations on Model Performance
Despite applying outlier removal, feature selection, and hyperparameter tuning, the model's performance did not improve significantly (cross-validated R² ≈ 0.22 on the log-transformed target). This could be due to the limited dataset size (500 samples), the high dimensionality (450 features), and potential noise in the data. Further improvements may require more data collection, advanced feature extraction, or deep learning approaches. This notebook documents the best effort made to optimize performance given the dataset constraints.