In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Train a random-forest regressor that predicts '% Silica Concentrate' from the
# remaining non-date columns of the CSV, and report the hold-out mean squared error.

# --- Configuration: everything a reader might want to tune lives here ---
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/MiningProcess_Flotation_Plant_Database.csv'
DATE_COLUMN = 'date'
TARGET_COLUMN = '% Silica Concentrate'
SAMPLE_FRACTION = 0.1  # fraction of rows kept so the cell runs quickly
TEST_SIZE = 0.2        # share of the cleaned rows held out for evaluation
N_ESTIMATORS = 50      # number of trees in the forest
SEED = 42              # one seed shared by sampling, splitting and the forest

# Load the dataset, parsing ',' as the decimal separator
data = pd.read_csv(DATA_PATH, decimal=',')

# Sample a fraction of the data for faster testing
data_sampled = data.sample(frac=SAMPLE_FRACTION, random_state=SEED)

# Convert the time-related column to datetime if it exists
# (the column is excluded from the features below, so this only affects data_sampled)
if DATE_COLUMN in data_sampled.columns:
    data_sampled[DATE_COLUMN] = pd.to_datetime(data_sampled[DATE_COLUMN])

if TARGET_COLUMN in data_sampled.columns:
    # Every column except the timestamp and the target is used as a feature.
    # NOTE(review): if any of these columns is measured together with the target
    # (e.g. another lab result), it leaks the answer into the model -- confirm
    # against the dataset description before trusting the score.
    feature_columns = [col for col in data_sampled.columns if col not in [DATE_COLUMN, TARGET_COLUMN]]

    # Cast to float32; astype raises if a column holds values that cannot be
    # converted, so a non-numeric column fails loudly here.
    X = data_sampled[feature_columns].astype('float32')
    y = data_sampled[TARGET_COLUMN].astype('float32')

    # Drop rows with missing values in either features or target.
    # Concatenating first keeps X and y aligned on the same surviving rows.
    data_cleaned = pd.concat([X, y], axis=1).dropna()
    X = data_cleaned[feature_columns]
    y = data_cleaned[TARGET_COLUMN]

    # Split the data.
    # NOTE(review): train_test_split shuffles rows at random by default. The rows
    # carry a timestamp, so if neighbouring rows share (or nearly share) the target
    # value -- TODO confirm -- train and test overlap in time and the MSE below is
    # optimistic. Consider sorting by date and passing shuffle=False instead.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED)

    # Configure the model with fewer estimators and parallel processing (n_jobs=-1)
    model = RandomForestRegressor(n_estimators=N_ESTIMATORS, n_jobs=-1, random_state=SEED)
    model.fit(X_train, y_train)

    # Predict and evaluate on the held-out rows
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f'Mean Squared Error: {mse}')
else:
    # Only reached when the target column is absent; no model is trained.
    print(f"'{TARGET_COLUMN}' column not found in the dataset.")


Mean Squared Error: 0.02682910353869957
