In [None]:
# 'dataset' holds the input data for this script

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Start from the frame that the host environment provides as `dataset`
df = dataset

# Remove columns that are not needed below. errors="ignore" makes drop()
# skip any label that is not present in the frame.
columns_to_drop = [
    "D.[NamePostfix]", "F.[Name]", "G1.[ParameterID]", "G1.[Name]", "G1.[OperatorMessage]",
    "G2.[ParameterID]", "G2.[Name]", "G2.[Description]", "G3.[ParameterID]", "G3.[Name]",
    "G3.[OperatorMessage]", "A.[ParameterID]", "A.[EntryTimestamp]", "A.[DataValue]",
    "A.[Description]", "B.[ParameterID]", "B.[EntryTimestamp]", "B.[DataValue]",
    "B.[Description]", "C.[ParameterID]"
]
df = df.drop(columns=columns_to_drop, errors="ignore")

value_col = "C.[DataValue]"
time_col = "C.[EntryTimestamp]"

# Coerce the measurement to numeric; unparseable entries become NaN
df[value_col] = pd.to_numeric(df[value_col], errors="coerce")

# Coerce the timestamp to datetime (unparseable entries become NaT), discard
# rows without a usable timestamp, then order the rows chronologically
df[time_col] = pd.to_datetime(df[time_col], errors="coerce")
df = df.dropna(subset=[time_col])
df = df.sort_values(by=time_col)
df = df.reset_index(drop=True)

# Outlier removal: keep only measurements inside the 1.5 * IQR fences.
# Rows whose measurement is NaN fail the comparison and are removed as well.
Q1 = df[value_col].quantile(0.25)
Q3 = df[value_col].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
within_fences = df[value_col].between(lower_bound, upper_bound)
df = df[within_fences].reset_index(drop=True)

# Create Lag Features (Updated: Removed DataValue_Lag1)
for lag in [3, 5]:
    df[f"DataValue_Lag{lag}"] = df["C.[DataValue]"].shift(lag)

# Create Rolling Statistics (mean and std over the trailing 3 and 5 rows)
for window in [3, 5]:
    rolling_values = df["C.[DataValue]"].rolling(window=window)
    df[f"Rolling_Mean_{window}"] = rolling_values.mean()
    df[f"Rolling_Std_{window}"] = rolling_values.std()

# Value-derived model inputs; Cycle_Count is added once the warm-up rows are gone
value_features = [
    "C.[DataValue]", "DataValue_Lag3", "DataValue_Lag5",
    "Rolling_Mean_3", "Rolling_Std_3", "Rolling_Mean_5", "Rolling_Std_5"
]

# Drop the rows whose lag/rolling features are undefined. The check is limited
# to the model inputs so that a missing value in some unrelated column does not
# remove a valid measurement and break the spacing between consecutive rows.
df = df.dropna(subset=value_features).reset_index(drop=True)

# Create a Cycle Count (1-based position of each remaining row)
df["Cycle_Count"] = range(1, len(df) + 1)

# Define feature columns (Updated to match new model)
feature_columns = ["Cycle_Count"] + value_features

# Define the target as the next row's measurement; the last row has no
# successor, so it is the only row removed here.
df["Target_NextCycle"] = df["C.[DataValue]"].shift(-1)
df = df.dropna(subset=["Target_NextCycle"]).reset_index(drop=True)

# Inputs and target for the regressor
X = df[feature_columns]
y = df["Target_NextCycle"]

# Random forest settings, gathered in one place; random_state fixes the seed
# so repeated runs on the same data build the same forest
rf_params = {
    "n_estimators": 40,
    "max_depth": 5,
    "min_samples_split": 15,
    "min_samples_leaf": 7,
    "max_features": "sqrt",
    "bootstrap": True,
    "random_state": 42,
}

# Fit the Random Forest Regressor on every available row
model = RandomForestRegressor(**rf_params)
model.fit(X, y)

# Define how many future cycles to predict
future_cycles = 10  # Adjust as needed

# Create a DataFrame for future predictions; its Cycle_Count continues
# directly after the last cycle present in df.
first_future_cycle = int(df["Cycle_Count"].max()) + 1
future_df = pd.DataFrame()
future_df["Cycle_Count"] = range(first_future_cycle, first_future_cycle + future_cycles)

# Running series of bore values: the observed measurements followed by each
# prediction as it is made. Every feature below is derived from this single
# series so that observed and predicted values mix naturally as the forecast
# moves forward, instead of reusing stale values from the last observed row.
value_history = df["C.[DataValue]"].tolist()
predicted_bores = []

# Predict future bore sizes recursively with the trained regression model
for cycle in future_df["Cycle_Count"]:
    window_3 = value_history[-3:]
    window_5 = value_history[-5:]

    new_row = {
        # The model was trained to predict the NEXT cycle's value, so the
        # inputs describe the cycle just before the one being predicted.
        "Cycle_Count": cycle - 1,
        "C.[DataValue]": value_history[-1],
        # Value 3 / 5 steps before the latest one; fall back to the oldest
        # available value when the history is too short.
        "DataValue_Lag3": value_history[-4] if len(value_history) >= 4 else value_history[0],
        "DataValue_Lag5": value_history[-6] if len(value_history) >= 6 else value_history[0],
        "Rolling_Mean_3": np.mean(window_3),
        # ddof=1 matches the default of pandas rolling().std() used to build
        # the training features (np.std defaults to ddof=0).
        "Rolling_Std_3": np.std(window_3, ddof=1) if len(window_3) > 1 else 0.0,
        "Rolling_Mean_5": np.mean(window_5),
        "Rolling_Std_5": np.std(window_5, ddof=1) if len(window_5) > 1 else 0.0,
    }

    # Convert to a one-row DataFrame with the training column order and predict
    new_X = pd.DataFrame([new_row])[feature_columns]
    predicted_bore = model.predict(new_X)[0]
    predicted_bores.append(predicted_bore)

    # Feed the prediction back so the next step treats it as the latest value
    value_history.append(predicted_bore)

# Store all predictions at once; the explicit float dtype keeps the column
# numeric even when future_cycles is 0.
future_df["Predicted_Bore_Size"] = np.asarray(predicted_bores, dtype=float)

# Compute bore size changes over time (the first forecast row has no
# predecessor inside future_df, so its change is set to 0)
future_df["Bore_Size_Change"] = future_df["Predicted_Bore_Size"].diff().fillna(0)

# Define wear classification function
def classify_wear(change):
    """Return the wear stage label for a bore size change.

    The change is compared against ascending upper limits and the label of
    the first limit it falls below is returned:

    * below 0.001 (including any negative change) -> "Normal Wear"
    * from 0.001 up to, but not including, 0.005 -> "Moderate Wear"
    * anything else -> "Critical Wear"; this also covers NaN, because NaN
      fails every ``<`` comparison.
    """
    stage_limits = (
        (0.001, "Normal Wear"),
        (0.005, "Moderate Wear"),
    )
    for upper_limit, stage in stage_limits:
        if change < upper_limit:
            return stage
    return "Critical Wear"

# Label every forecast row with the wear stage implied by its bore size change
future_df["Predicted_Wear_Stage"] = future_df["Bore_Size_Change"].map(classify_wear)

# The historical rows carry no forecast, so give them empty (NaN) placeholders
# for the prediction columns before the two frames are stacked
for placeholder_col in ("Predicted_Bore_Size", "Predicted_Wear_Stage"):
    df[placeholder_col] = np.nan

# Final dataset: historical rows first, forecast rows after, with a fresh index
final_df = pd.concat([df, future_df], ignore_index=True)

final_df



NameError: name 'dataset' is not defined