# AAP AI Model for Sample Categorization - CLEAN VERSION

**Admin Number**: 230327F  
**Module**: IT3100 - AI Applications Project  
**Name**: Jason Hong Jie Sen

---

This is a clean version of the notebook without Git merge conflicts.

## Data Preparation and Cleaning

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import pickle

# Load data
# NOTE: these are absolute, machine-specific paths; update them before running
# the notebook on another machine.
train_df = pd.read_csv(r'C:\Users\jason\OneDrive\Documents\2025 Sem1 Study\IT3100_AAP\AAP_Dataset (Supply Chain)\Train_Set.csv')
test_df = pd.read_csv(r'C:\Users\jason\OneDrive\Documents\2025 Sem1 Study\IT3100_AAP\AAP_Dataset (Supply Chain)\Test_Set.csv')

# Clean column names first, so the column drops and lookups below cannot fail
# because of stray whitespace in the CSV header.
train_df.columns = train_df.columns.str.strip()
test_df.columns = test_df.columns.str.strip()

# Sample data for faster processing. The sample size is capped at the number
# of available rows because DataFrame.sample raises ValueError when n exceeds
# the population (sampling without replacement).
train_df = train_df.sample(n=min(3000, len(train_df)), random_state=42)
test_df = test_df.sample(n=min(1000, len(test_df)), random_state=42)

# Drop unnecessary columns (same list for both sets, so define it once)
cols_to_drop = ['OrderId', 'Customer_Id', 'Dept_Id', 'Zipcode', 'Prod_Category_Id', 'CategoryName']
train_df = train_df.drop(cols_to_drop, axis=1)
test_df = test_df.drop(cols_to_drop, axis=1)

# Handle missing values and duplicates
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)
train_df.drop_duplicates(inplace=True)
test_df.drop_duplicates(inplace=True)

# Standardize string columns: trim surrounding whitespace and lower-case so
# the same category is not split by formatting differences.
str_cols = ['Product_type', 'Customer_Category', 'Dept_Name', 'Shipping_Class', 'Warehouse_Region', 'Order_zone']
for col in str_cols:
    train_df[col] = train_df[col].str.strip().str.lower()
    test_df[col] = test_df[col].str.strip().str.lower()

print("Data preprocessing completed!")
print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

In [None]:
# Encode target variable
# A first fit over the raw department names is used only to translate the
# hard-coded low-support class indices below back into department names.
label_encoder = LabelEncoder()
label_encoder.fit(train_df['Dept_Name'])
dept_classes = label_encoder.classes_

# Handle low-support classes
# The indices refer to positions in the first fit's sorted class list; indices
# outside that list are ignored (they could never match an encoded value).
low_support_classes = [1, 2, 4, 7, 9, 10]
low_support_names = [
    dept_classes[i] for i in low_support_classes if 0 <= i < len(dept_classes)
]
dept_grouped = train_df['Dept_Name'].where(
    ~train_df['Dept_Name'].isin(low_support_names), 'Other'
)

# Fit the final encoder on department NAMES (with low-support ones merged into
# 'Other'). Fitting it on stringified integer codes instead would make
# inverse_transform return values such as '0' or '3' rather than department
# names when decoding predictions.
train_df['Dept_Name_encoded'] = label_encoder.fit_transform(dept_grouped)

# Select features
features_to_keep = ['Price', 'Sales', 'Order_Profit', 'ProductWeight', 'Quantity']
X = train_df[features_to_keep]
y = train_df['Dept_Name_encoded']
X_test = test_df[features_to_keep]

print("Feature selection completed!")
print(f"Features: {features_to_keep}")
print(f"Target classes: {len(set(y))}")

In [None]:
# Hold out 20% of the labelled data for validation, keeping class proportions
# the same in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Oversample the training part only, so the validation part stays untouched.
# NOTE(review): resampling happens before cross-validation, so synthetic rows
# can end up in the CV validation folds; treat the CV score as optimistic and
# rely on the hold-out validation accuracy instead.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Hyperparameter search space for the gradient boosting classifier
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 5],
)

# 3-fold grid search over the space above, scored by accuracy, using all cores
gb_model = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=gb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train_balanced, y_train_balanced)

print("Model training completed!")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

In [None]:
# Score the tuned model on the hold-out validation split
y_val_pred = grid_search.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
cm = confusion_matrix(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))

# Confusion matrix heatmap (rows: true labels, columns: predicted labels)
plt.figure(figsize=(8, 6))
cm_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label')
plt.show()

In [None]:
# Rank the input features by the importance the best model assigns to them
print("\n=== FEATURE IMPORTANCE ANALYSIS ===")

# best_estimator_ only exists once the grid search has been fitted
if not hasattr(grid_search, 'best_estimator_'):
    print("Model not trained yet.")
else:
    best_model = grid_search.best_estimator_
    feature_importance = pd.DataFrame(
        {
            'feature': X.columns,
            'importance': best_model.feature_importances_,
        }
    )
    # Most important feature first
    feature_importance = feature_importance.sort_values(by='importance', ascending=False)

    print("\nFeature Importance Rankings:")
    print(feature_importance.to_string())

    # Horizontal bar chart of the ranking
    plt.figure(figsize=(10, 6))
    sns.barplot(x='importance', y='feature', data=feature_importance)
    plt.title('Feature Importance')
    plt.xlabel('Importance Score')
    plt.tight_layout()
    plt.show()

In [None]:
# Predict encoded classes for the test features, then map them back through
# the fitted label encoder.
test_predictions = grid_search.predict(X_test)
test_predictions_decoded = label_encoder.inverse_transform(test_predictions)

# Single-column frame holding the decoded predictions
predictions_df = pd.Series(
    test_predictions_decoded,
    name='Predicted_Dept_Name',
).to_frame()

# Write the predictions to disk without the index column
output_path = 'predicted_dept_names.csv'
predictions_df.to_csv(output_path, index=False)
print("Predictions saved to 'predicted_dept_names.csv'")
print(f"Total predictions: {len(predictions_df)}")

In [None]:
# Save the trained model and label encoder
import os

model_dir = '../model'
model_path = '../model/gradient_boosting_model.pkl'
encoder_path = '../model/label_encoder.pkl'

# Create the output directory if it is missing; otherwise both dumps below
# fail with FileNotFoundError on a fresh checkout.
os.makedirs(model_dir, exist_ok=True)

# Only the refitted best estimator is persisted, not the whole grid search
joblib.dump(grid_search.best_estimator_, model_path)

# The encoder is needed at inference time to decode predicted class ids
with open(encoder_path, 'wb') as f:
    pickle.dump(label_encoder, f)

print("Model and label encoder saved successfully!")
print("Files saved:")
print(f"- {model_path}")
print(f"- {encoder_path}")