# In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor


# Bind the input table to `df`. `dataset` is not defined anywhere in this
# script: per the original note it is supplied by Power BI (or by SQL when
# used in Power BI). Running the script without that host-provided name
# fails with "NameError: name 'dataset' is not defined".
df = dataset  # Power BI automatically loads data


def drop_unnecessary_columns(df):
    """Return a copy of *df* without the known bookkeeping columns.

    Only the names in the fixed list below are considered. Names that are
    not present in *df* are skipped, and the input frame is left untouched
    because ``DataFrame.drop`` returns a new frame.
    """
    unwanted = (
        "D.[NamePostfix]",
        "F.[Name]",
        "G1.[ParameterID]",
        "G1.[Name]",
        "G1.[OperatorMessage]",
        "G2.[ParameterID]",
        "G2.[Name]",
        "G2.[Description]",
        "G3.[ParameterID]",
        "G3.[Name]",
        "G3.[OperatorMessage]",
        "A.[ParameterID]",
        "A.[EntryTimestamp]",
        "A.[DataValue]",
        "A.[Description]",
        "B.[ParameterID]",
        "B.[EntryTimestamp]",
        "B.[DataValue]",
        "B.[Description]",
        "C.[ParameterID]",
    )
    existing = df.columns
    present = [name for name in unwanted if name in existing]
    return df.drop(columns=present, errors="ignore")


def remove_outliers(df, column):
    """Keep only the rows whose *column* value lies inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, both inclusive.
    Rows whose value is NaN fail the range check and are therefore removed
    as well. The original index labels of the surviving rows are preserved.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    margin = 1.5 * (third_quartile - first_quartile)
    in_range = values.between(first_quartile - margin, third_quartile + margin)
    return df[in_range]


# Data Cleaning and Preprocessing
# Remove the bookkeeping columns listed in drop_unnecessary_columns().
df = drop_unnecessary_columns(df)
# errors="coerce": values that cannot be parsed become NaN / NaT instead of raising.
df["C.[DataValue]"] = pd.to_numeric(df["C.[DataValue]"], errors="coerce")
df["C.[EntryTimestamp]"] = pd.to_datetime(df["C.[EntryTimestamp]"], errors="coerce")
# Discard rows without a usable timestamp, order the rest chronologically and
# renumber the index so later shift()/rolling() calls operate in time order.
df = df.dropna(subset=["C.[EntryTimestamp]"]).sort_values(by="C.[EntryTimestamp]").reset_index(drop=True)
# IQR filter on the measurement. Rows whose value is NaN (e.g. coerced above)
# are removed here too, because NaN fails both bound comparisons.
df = remove_outliers(df, "C.[DataValue]").reset_index(drop=True)


# Feature Engineering
def create_lag_features(df, column, lags):
    """Add one shifted copy of *column* to *df* for every entry in *lags*.

    Each new column is named ``{column}_lag{lag}`` and holds *column*
    shifted by ``lag`` rows; positions with no source row are NaN.
    The frame is modified in place and the same object is returned.
    """
    for step in lags:
        lagged_name = f"{column}_lag{step}"
        df[lagged_name] = df[column].shift(periods=step)
    return df


def create_rolling_features(df, column, windows):
    """Add rolling mean and rolling standard deviation columns to *df*.

    For every window size in *windows*, two columns are created:
    ``{column}_rolling_mean_{window}`` and ``{column}_rolling_std_{window}``.
    The frame is modified in place and the same object is returned.
    """
    for size in windows:
        roller = df[column].rolling(window=size)
        mean_name = f"{column}_rolling_mean_{size}"
        std_name = f"{column}_rolling_std_{size}"
        df[mean_name] = roller.mean()
        df[std_name] = roller.std()
    return df


# Apply feature engineering
df = create_lag_features(df, "C.[DataValue]", lags=[1, 2, 3, 5])
df = create_rolling_features(df, "C.[DataValue]", windows=[3, 5])
# The earliest rows have NaN lag/rolling values (not enough history), so they
# are dropped. Note that dropna() without `subset` also removes any row that
# has a NaN in one of the other remaining columns.
df = df.dropna().reset_index(drop=True)
# Sequential cycle number, 1-based, assigned after cleaning.
df["cycle_count"] = range(1, len(df) + 1)


# Define feature columns
feature_columns = [
    "cycle_count", "C.[DataValue]", "C.[DataValue]_lag1", "C.[DataValue]_lag2",
    "C.[DataValue]_lag3", "C.[DataValue]_lag5", "C.[DataValue]_rolling_mean_3",
    "C.[DataValue]_rolling_std_3", "C.[DataValue]_rolling_mean_5", "C.[DataValue]_rolling_std_5"
]

# Define target variable: the measurement observed one cycle later.
df["target_next_cycle"] = df["C.[DataValue]"].shift(-1)

# Train a Random Forest Regressor
# Only rows that have a target can be used for training. The newest row has
# no "next" value yet, so it is excluded from X/y through a mask, but it is
# deliberately kept in `df`: dropping it there would throw away the latest
# observation, making the forecast start one cycle stale and label an already
# observed cycle as the first "future" one.
has_target = df["target_next_cycle"].notna()
X = df.loc[has_target, feature_columns]
y = df.loc[has_target, "target_next_cycle"]
# Fixed random_state keeps the fitted forest reproducible between runs.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)


def generate_future_predictions(
    model,
    df,
    feature_columns,
    future_cycles=10,
    value_column="C.[DataValue]",
    lags=(1, 2, 3, 5),
    windows=(3, 5),
):
    """Recursively forecast the measured value for the next cycles.

    Each step builds one feature row from the most recent values (observed
    values first, then earlier predictions), asks *model* for the next value
    and feeds that prediction back into the history used by the next step.

    Args:
        model: Fitted estimator exposing ``predict(X)``; the first element of
            its result is used as the prediction.
        df: Historical frame. Must contain ``cycle_count``, *value_column*
            and every name in *feature_columns*; its last row is treated as
            the most recent observation.
        feature_columns: Column names, in order, that *model* expects.
        future_cycles: Number of cycles to forecast.
        value_column: Column holding the measured value; lag and rolling
            feature names are derived from it.
        lags: Lags for which ``{value_column}_lag{lag}`` features are filled.
        windows: Window sizes for which ``{value_column}_rolling_mean_{w}``
            and ``{value_column}_rolling_std_{w}`` features are filled.

    Returns:
        DataFrame with ``cycle_count``, ``predicted_bore_size`` and
        ``bore_size_change`` (difference to the previous prediction, 0 for
        the first row), one row per forecast cycle.

    Raises:
        ValueError: If *df* has no rows.
    """
    if df.empty:
        raise ValueError("df must contain at least one row to forecast from")

    first_cycle = df["cycle_count"].max() + 1
    future_df = pd.DataFrame(
        {"cycle_count": range(first_cycle, first_cycle + future_cycles)}
    )

    # Starting point for features that are not recomputed below, and the
    # fallback when the history is too short for a given lag or window.
    last_known_values = df.iloc[-1][feature_columns].to_dict()

    # Seed the history with just enough observed values to serve the largest
    # lag (needs lag + 1 points) and the largest rolling window.
    depth = max([1, *windows, *(lag + 1 for lag in lags)])
    history = df[value_column].tail(depth).tolist()

    predictions = []
    for cycle in future_df["cycle_count"]:
        new_row = last_known_values.copy()
        # The row is stamped with the cycle being predicted, as before.
        new_row["cycle_count"] = cycle

        # The "current value" feature must advance with the forecast;
        # leaving it at the last observation makes every step inconsistent.
        new_row[value_column] = history[-1]

        # lag k is the value k steps before the current one, as in training.
        for lag in lags:
            if len(history) > lag:
                new_row[f"{value_column}_lag{lag}"] = history[-1 - lag]

        for window in windows:
            if len(history) >= window:
                recent = history[-window:]
                new_row[f"{value_column}_rolling_mean_{window}"] = np.mean(recent)
                # ddof=1 matches pandas' rolling().std(), which produced the
                # training features; numpy's default (ddof=0) would not.
                new_row[f"{value_column}_rolling_std_{window}"] = np.std(recent, ddof=1)

        new_X = pd.DataFrame([new_row])[feature_columns]
        predicted_bore = model.predict(new_X)[0]
        predictions.append(predicted_bore)
        history.append(predicted_bore)

    # Assign once after the loop; this also creates the column when
    # future_cycles is 0, so the diff below never hits a missing column.
    future_df["predicted_bore_size"] = np.asarray(predictions, dtype=float)
    future_df["bore_size_change"] = future_df["predicted_bore_size"].diff().fillna(0)

    return future_df


def classify_wear(change):
    """Map a bore size change to a wear severity label.

    Changes below 0.001 (including negative ones) are "Normal Wear", changes
    from 0.001 up to but excluding 0.005 are "Moderate Wear", and everything
    else (including NaN, which fails every comparison) is "Critical Wear".
    """
    upper_limits = (
        (0.001, "Normal Wear"),
        (0.005, "Moderate Wear"),
    )
    for limit, label in upper_limits:
        if change < limit:
            return label
    return "Critical Wear"


# Generate predictions and classify wear stages
future_cycles = 10  # number of cycles to forecast beyond the last row of df
future_df = generate_future_predictions(model, df, feature_columns, future_cycles)
# Label each forecast row from its step-to-step change in predicted size.
future_df["predicted_wear_stage"] = future_df["bore_size_change"].apply(classify_wear)

# Combine actual & future data
# Historical rows carry no prediction, so the prediction columns are added as
# NaN placeholders before stacking. Columns that exist in only one of the two
# frames are filled with NaN for the other frame's rows by concat.
df["predicted_bore_size"] = np.nan
df["predicted_wear_stage"] = np.nan
final_df = pd.concat([df, future_df], ignore_index=True)

# Bare expression: presumably left so a notebook cell displays the result.
final_df


# Output recorded when this cell was run without a host-provided `dataset`:
# NameError: name 'dataset' is not defined