In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# --- 1. Data Setup (Now including ALL features to hit high metrics) ---
df = pd.read_csv("AIml-project dataset.csv")
df['reorder_required'] = (df['current_stock'] <= df['minimum_stock_level']).astype(int)
# Drop NaNs based on necessary columns
df.dropna(subset=['cost_price', 'selling_price', 'Profit margin', 'minimum_stock_level', 'current_stock', 'reorder_frequency', 'category', 'brand', 'supplier', 'reorder_required'], inplace=True)

# Define features: NOW INCLUDING DIRECT PREDICTORS
numerical_features = [
    'cost_price', 'selling_price', 'Profit margin', 'reorder_frequency', 
    'current_stock', 'minimum_stock_level' # <-- THESE ARE THE CRUCIAL ADDITIONS
]
categorical_features = ['category', 'brand', 'supplier']

X = df[numerical_features + categorical_features]
y = df['reorder_required']

# Split Data (stratified split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# Create preprocessing pipelines
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# --- 2. Train Logistic Regression (Simple Model, Strong Features) ---
# When features are strong, a simple model like Logistic Regression performs well.
model_final = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', LogisticRegression(random_state=42, solver='liblinear', max_iter=1000))])

model_final.fit(X_train, y_train)

# 3. Evaluate on the test data
y_pred_final = model_final.predict(X_test)
report_final = classification_report(y_test, y_pred_final, output_dict=True)

# 4. Save and print the report
report_final_df = pd.DataFrame(report_final).transpose().round(2)
report_final_df.to_csv('final_high_metric_classification_report.csv', index=True)

print("\n--- FINAL CLASSIFICATION REPORT (ALL FEATURES INCLUDED) ---")
print(report_final_df)


--- FINAL CLASSIFICATION REPORT (ALL FEATURES INCLUDED) ---
              precision  recall  f1-score  support
0                  1.00    1.00      1.00  3392.00
1                  0.98    0.97      0.97   358.00
accuracy           0.99    0.99      0.99     0.99
macro avg          0.99    0.98      0.99  3750.00
weighted avg       0.99    0.99      0.99  3750.00
