# Model Training - Random Forest Keyword Classifier

This notebook trains and evaluates a Random Forest model for keyword relevance classification.

In [None]:
# Data handling and numerics
import pandas as pd
import numpy as np
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
# Model, model-selection helpers and evaluation metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
# Model persistence
import joblib
import sys
# Add the directory two levels above the working directory to the import path.
# NOTE(review): presumably this is what lets `from ml.config import ...` in the
# baseline-training cell resolve — confirm against the repository layout.
sys.path.append('../..')

# Apply seaborn's 'whitegrid' style to the plots drawn in this notebook.
sns.set_style('whitegrid')

## Load Training Data

In [None]:
# Read the feedback CSV and report its size and the mean of the label column.
df = pd.read_csv('../../data/feedback.csv')
label_series = df['label']
print(f"Total samples: {len(df)}")
print(f"Approval rate: {label_series.mean():.2%}")

# Columns fed to the model as inputs; the `label` column is the target.
feature_cols = [
    'length',
    'yake_score',
    'f1_wfreq',
    'f2_wcase',
    'f3_wpos',
    'f4_wrel',
    'f5_wspread',
]
y = label_series.values
X = df[feature_cols].values

# Sanity-check the array dimensions before training.
print(f"\nFeature shape: {X.shape}")
print(f"Label shape: {y.shape}")

## Train Baseline Model

In [None]:
# Train Random Forest with default parameters from config
from ml.config import MODEL_PARAMS

model = RandomForestClassifier(**MODEL_PARAMS)
# Fit on the full data set; later cells read this fitted estimator
# (feature importances, predictions, saving to disk).
model.fit(X, y)

# Cross-validation
# With an integer `cv` and a classifier, cross_val_score uses stratified
# folds, so the usable number of folds is limited by the smallest class,
# not by the total number of samples. Use at most 5 folds and never fewer
# than 2 (the minimum scikit-learn accepts).
_, class_counts = np.unique(y, return_counts=True)
n_folds = max(2, min(5, int(class_counts.min())))

# cross_val_score fits clones of `model`, so the estimator fitted above is
# left untouched.
cv_scores = cross_val_score(model, X, y, cv=n_folds, scoring='accuracy')
print(f"Cross-validation accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})")

## Feature Importance

In [None]:
# Pair each feature name with the importance reported by the fitted forest
# and order the table from most to least important.
importance_table = {
    'feature': feature_cols,
    'importance': model.feature_importances_,
}
importance_df = pd.DataFrame(importance_table).sort_values(
    by='importance', ascending=False
)

# Horizontal bar chart: one bar per feature, longest bar first.
plt.figure(figsize=(10, 6))
importance_ax = sns.barplot(data=importance_df, x='importance', y='feature')
importance_ax.set_title('Feature Importance')
importance_ax.set_xlabel('Importance Score')
plt.tight_layout()
plt.show()

# Also print the raw numbers behind the chart.
print(importance_df)

## Hyperparameter Tuning (Optional)

In [None]:
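# Grid search for best parameters (uncomment to experiment).
# Note: this replaces `model` with the tuned estimator, but `cv_scores` and
# `MODEL_PARAMS` reported in the later cells still describe the baseline model.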
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [5, 10, 15, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# grid_search = GridSearchCV(
#     RandomForestClassifier(random_state=42),
#     param_grid,
#     cv=min(5, len(X)),
#     scoring='accuracy',
#     n_jobs=-1
# )
# grid_search.fit(X, y)

# print(f"Best parameters: {grid_search.best_params_}")
# print(f"Best CV score: {grid_search.best_score_:.3f}")
# model = grid_search.best_estimator_

## Training Performance

In [None]:
# Score the model on the same samples it was fitted on, so these figures
# describe the fit to the training data rather than held-out performance.
# NOTE(review): the display names assume the sorted label values mean
# (rejected, approved) in that order — confirm against the data.
class_names = ['Rejected', 'Approved']
y_pred = model.predict(X)

# Per-class precision / recall / F1
print("Classification Report:")
report_text = classification_report(y, y_pred, target_names=class_names)
print(report_text)

# Confusion matrix rendered as an annotated heatmap
cm = confusion_matrix(y, y_pred)
plt.figure(figsize=(8, 6))
cm_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_ylabel('True Label')
cm_ax.set_xlabel('Predicted Label')
plt.tight_layout()
plt.show()

## Save Model

In [None]:
# Persist the fitted model under a timestamped file name so that repeated
# runs do not overwrite earlier models.
from datetime import datetime
import os

# Make sure the output directory exists before writing into it.
os.makedirs('../models', exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_path = f'../models/rf_model_{timestamp}.pkl'
joblib.dump(model, model_path)

# Short confirmation of what was written and how the model scored.
for status_line in (
    f"Model saved to: {model_path}",
    f"Number of samples: {len(X)}",
    f"CV Accuracy: {cv_scores.mean():.3f}",
):
    print(status_line)

## Model Summary

In [None]:
# Collect the key facts about this training run in a single dictionary:
# when it ran, data dimensions, cross-validation results, the parameters
# loaded from config, and the three highest-ranked features.
top_three_features = importance_df['feature'].head(3).tolist()
n_samples, n_features = len(X), X.shape[1]

summary = dict(
    timestamp=timestamp,
    num_samples=n_samples,
    num_features=n_features,
    cv_accuracy=cv_scores.mean(),
    cv_std=cv_scores.std(),
    model_params=MODEL_PARAMS,
    top_features=top_three_features,
)

# Print one "name: value" line per entry, in insertion order.
print("\n=== Training Summary ===")
for field in summary:
    print(f"{field}: {summary[field]}")