# Assignment 14: Random Forest Classification

## Dataset: Glass Identification

**Topics Covered:**
- Ensemble Learning
- Random Forest vs Decision Tree
- Feature Importance

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load data
# Read the workbook via a path relative to the current working directory
# (pandas reads the first sheet unless told otherwise), report the frame's
# dimensions, then preview the top rows as the cell's output.
df = pd.read_excel(io='glass.xlsx')
print("Dataset loaded! Shape:", df.shape)
df.head(n=5)

In [None]:
# Prepare data
# Assumes the last column of `df` holds the class label and every other
# column is a feature -- TODO confirm against the spreadsheet layout.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Split data
# Stratify on the label so the train and test sets keep the same class
# proportions; a purely random split can leave a rare class under-represented
# (or absent) in the test set, which distorts accuracy and the per-class report.
# scikit-learn refuses to stratify when any class has fewer than 2 rows, so
# fall back to the plain random split in that case.
stratify_labels = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)
print("Training:", len(X_train), "Testing:", len(X_test))

In [None]:
# Train Decision Tree (for comparison)
# Baseline model: one seeded tree fitted on the training split and scored
# on the held-out test split.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_pred = dt.predict(X_test)
dt_acc = accuracy_score(y_true=y_test, y_pred=dt_pred)
print("Decision Tree Accuracy:", round(dt_acc, 4))

In [None]:
# Train Random Forest
# Ensemble of 100 seeded trees fitted on the same training split and scored
# on the same test split as the decision tree.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_acc = accuracy_score(y_true=y_test, y_pred=rf_pred)
print("Random Forest Accuracy:", round(rf_acc, 4))

In [None]:
# Compare models
# Print both test accuracies, then the forest's gain over the tree in
# percentage points (negative when the tree scored higher).
print("=== Model Comparison ===")
for model_label, model_acc in (("Decision Tree:", dt_acc), ("Random Forest:", rf_acc)):
    print(model_label, round(model_acc, 4))

accuracy_gain = round((rf_acc - dt_acc) * 100, 2)
print("\nRandom Forest improvement:", accuracy_gain, "%")

In [None]:
# Feature Importance
# Pair each feature column with the forest's importance score and sort
# ascending so the most important feature ends up at the top of the
# horizontal bar chart.
importances = rf.feature_importances_
feature_importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importances}
).sort_values(by='Importance')

# Draw on explicit figure/axes objects, write the chart to disk, then show it.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(
    feature_importance['Feature'],
    feature_importance['Importance'],
    color='steelblue',
)
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Random Forest Feature Importance')
fig.savefig('feature_importance.png')
plt.show()

In [None]:
# Classification Report
# Text summary of the forest's test-set predictions compared against the
# true labels.
print("=== Classification Report ===")
rf_report = classification_report(y_true=y_test, y_pred=rf_pred)
print(rf_report)

## Summary

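- A Random Forest generally outperforms a single Decision Tree (compare the accuracies printed in the model comparison above)
- Ensemble methods reduce overfitting by averaging the predictions of many trees
- Feature importance helps identify which features drive the model's predictions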