In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (classification_report, confusion_matrix,
                             accuracy_score, f1_score)
import warnings
# Notebook-wide display setup.
# Every warning category is silenced for the rest of the session.
warnings.filterwarnings('ignore')

# Plot defaults: seaborn's white grid theme and a 12x6 default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

In [None]:
# --- Load the training data and report its size ----------------------------
banner = "=" * 70
print(banner)
print("FOREST COVER TYPE PREDICTION")
print(banner)

df = pd.read_csv('train.csv')
n_samples, n_columns = df.shape
print(f"\nDataset: {n_samples} samples, {n_columns} features")

# Class balance: one line per target label 1..7 with its count and share.
print("\nTarget distribution:")
for cover_label in range(1, 8):
    n_in_class = (df['Cover_Type'] == cover_label).sum()
    print(f"Class {cover_label}: {n_in_class} ({n_in_class/len(df)*100:.1f}%)")

# --- Derived features (appended to df in this order) -----------------------
# Straight-line distance to hydrology from its horizontal and vertical parts.
horiz_hydro = df['Horizontal_Distance_To_Hydrology']
vert_hydro = df['Vertical_Distance_To_Hydrology']
df['Distance_To_Hydrology'] = np.sqrt(horiz_hydro**2 + vert_hydro**2)

# Average of the three horizontal distances (hydrology, roadways, fire points).
horizontal_total = (
    horiz_hydro
    + df['Horizontal_Distance_To_Roadways']
    + df['Horizontal_Distance_To_Fire_Points']
)
df['Mean_Distance'] = horizontal_total / 3

# Mean of the three hillshade readings.
hillshade_total = df['Hillshade_9am'] + df['Hillshade_Noon'] + df['Hillshade_3pm']
df['Mean_Hillshade'] = hillshade_total / 3

# Spread (row-wise max minus min) of the same three readings.
hillshade = df[['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']]
df['Hillshade_Range'] = hillshade.max(axis=1) - hillshade.min(axis=1)

# --- Final feature matrix / target vector ----------------------------------
# The 'Id' column is removed when the file provides one.
if 'Id' in df.columns:
    df = df.drop(columns='Id')

y = df['Cover_Type']
X = df.drop(columns='Cover_Type')


In [None]:
# Hold out 20% of the rows for testing, stratified on the target so both
# splits keep the same class proportions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardise features: statistics are fitted on the training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\nTrain: {X_train.shape[0]} | Test: {X_test.shape[0]}")

# Two-panel exploratory figure: class counts (left), correlations (right).
fig, (ax_counts, ax_corr) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: number of samples per cover type, ordered by class label.
class_counts = y.value_counts().sort_index()
class_counts.plot(kind='bar', color='forestgreen', edgecolor='black', ax=ax_counts)
ax_counts.set_title('Cover Type Distribution', fontweight='bold')
ax_counts.set_xlabel('Cover Type')
ax_counts.set_ylabel('Count')
ax_counts.grid(axis='y', alpha=0.3)

# Right panel: pairwise correlation of four of the numeric terrain columns.
continuous = ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology']
correlations = df[continuous].corr()
sns.heatmap(correlations, annot=True, cmap='RdYlGn', center=0, square=True, ax=ax_corr)
ax_corr.set_title('Feature Correlation', fontweight='bold')

fig.tight_layout()
plt.show()

In [None]:
print("\n" + "="*70)
print("TRAINING MODELS")
print("="*70)


def _fit_and_score(estimator, X_fit, y_fit, X_eval, y_eval):
    """Fit ``estimator`` and evaluate it on a held-out split.

    Parameters
    ----------
    estimator : classifier exposing ``fit`` and ``predict``.
    X_fit, y_fit : features and labels used for fitting.
    X_eval, y_eval : features and labels used for scoring.

    Returns
    -------
    dict
        ``'model'`` (the fitted estimator), ``'accuracy'``, ``'f1'``
        (weighted-average F1) and ``'pred'`` (predictions for ``X_eval``).
    """
    estimator.fit(X_fit, y_fit)
    pred = estimator.predict(X_eval)
    return {
        'model': estimator,
        'accuracy': accuracy_score(y_eval, pred),
        'f1': f1_score(y_eval, pred, average='weighted'),
        'pred': pred,
    }


# Candidate models; random_state is fixed so repeated runs give the same fit.
rf = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_split=10,
                            random_state=42, n_jobs=-1)
gb = GradientBoostingClassifier(n_estimators=150, learning_rate=0.1, max_depth=6,
                                random_state=42)

# Every candidate goes through the same fit -> predict -> score -> record path,
# so adding a model only means adding an entry to this list.
candidates = [('Random Forest', rf), ('Gradient Boosting', gb)]

models = {}
for position, (model_name, estimator) in enumerate(candidates, start=1):
    print(f"\n[{position}] {model_name}")
    outcome = _fit_and_score(estimator, X_train_scaled, y_train, X_test_scaled, y_test)
    models[model_name] = outcome
    print(f"Accuracy: {outcome['accuracy']:.4f} | F1: {outcome['f1']:.4f}")

# Flat per-model names are kept so any cell that still refers to them works.
rf_pred, rf_acc, rf_f1 = (models['Random Forest'][key] for key in ('pred', 'accuracy', 'f1'))
gb_pred, gb_acc, gb_f1 = (models['Gradient Boosting'][key] for key in ('pred', 'accuracy', 'f1'))


In [None]:
print("\n" + "="*70)
print("MODEL COMPARISON")
print("="*70)

# One row per trained model, in the order the models were registered.
results = pd.DataFrame({
    'Model': list(models),
    'Accuracy': [entry['accuracy'] for entry in models.values()],
    'F1-Score': [entry['f1'] for entry in models.values()],
})

print("\n" + results.to_string(index=False))

# The winner is the model with the highest test accuracy.
best_model = results.loc[results['Accuracy'].idxmax(), 'Model']
best_acc = results['Accuracy'].max()

print(f"\nBest Model: {best_model}")
print(f"Accuracy: {best_acc:.4f}")

# Grouped bars: accuracy and F1 side by side for each model.
plt.figure(figsize=(10, 6))
x = np.arange(len(results))
width = 0.35

plt.bar(x - width/2, results['Accuracy'], width, label='Accuracy', color='darkgreen', edgecolor='black')
plt.bar(x + width/2, results['F1-Score'], width, label='F1-Score', color='forestgreen', edgecolor='black')

plt.xlabel('Models', fontweight='bold')
plt.ylabel('Score', fontweight='bold')
plt.title('Model Performance Comparison', fontweight='bold')
plt.xticks(x, results['Model'])
plt.legend()
plt.grid(axis='y', alpha=0.3)

# Zoom the y-axis onto the observed scores instead of a fixed [0.8, 0.9]
# window: a fixed window clips bars above 0.9 and hides bars below 0.8.
# The padded range is clamped to [0, 1], the valid range of both metrics.
plotted_scores = results[['Accuracy', 'F1-Score']].to_numpy()
y_padding = 0.05
y_lower = max(0.0, float(plotted_scores.min()) - y_padding)
y_upper = min(1.0, float(plotted_scores.max()) + y_padding)
plt.ylim([y_lower, y_upper])

plt.tight_layout()
plt.show()

In [None]:
# Human-readable name for each target label.
cover_types = {
    1: 'Spruce/Fir', 2: 'Lodgepole Pine', 3: 'Ponderosa Pine',
    4: 'Cottonwood/Willow', 5: 'Aspen', 6: 'Douglas-fir', 7: 'Krummholz'
}

# Fixed label order shared by the report, the matrix and the tick labels.
# Passing it explicitly keeps names aligned with classes even when a class
# is missing from the test labels or predictions; without it the report
# raises on a name/class count mismatch and the matrix shrinks, so its rows
# and columns no longer match the seven tick labels.
class_ids = sorted(cover_types)
class_names = [cover_types[class_id] for class_id in class_ids]

best_pred = models[best_model]['pred']

print("\n" + "="*70)
print("CLASSIFICATION REPORT")
print("="*70)
print(classification_report(y_test, best_pred,
                            labels=class_ids,
                            target_names=class_names))

# Rows are actual classes, columns are predicted classes, both in class_ids order.
cm = confusion_matrix(y_test, best_pred, labels=class_ids)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Greens',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title(f'Confusion Matrix - {best_model}', fontweight='bold', fontsize=14)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.tight_layout()
plt.show()

In [None]:
# Pair every input column with the importance reported by the best model,
# sorted from most to least important.
best_estimator = models[best_model]['model']
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': best_estimator.feature_importances_,
}).sort_values('Importance', ascending=False)

rule = "=" * 70
print("\n" + rule)
print("FEATURE IMPORTANCE")
print(rule)
print("\nTop 15 features:")
print(feature_importance.head(15).to_string(index=False))

# Horizontal bar chart of the 20 largest importances, biggest at the top.
top20 = feature_importance.head(20)
bar_positions = range(len(top20))

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(bar_positions, top20['Importance'], color='forestgreen')
ax.set_yticks(bar_positions)
ax.set_yticklabels(top20['Feature'])
ax.set_xlabel('Importance')
ax.set_title('Top 20 Feature Importances', fontweight='bold', fontsize=14)
ax.invert_yaxis()  # barh draws the first row at the bottom by default
ax.grid(axis='x', alpha=0.3)
fig.tight_layout()
plt.show()

# Qualitative label for the best accuracy: above 0.85, above 0.80, otherwise.
if best_acc > 0.85:
    performance_label = 'Excellent'
elif best_acc > 0.80:
    performance_label = 'Good'
else:
    performance_label = 'Fair'

print("\n" + rule)
print("SUMMARY")
print(rule)
print(f"""
Dataset: Roosevelt National Forest
Total Samples: {len(df):,}
Features: {X.shape[1]}
Classes: 7

Best Model: {best_model}
Test Accuracy: {best_acc:.4f} ({best_acc*100:.2f}%)
Test F1-Score: {models[best_model]['f1']:.4f}

Performance: {performance_label}
""")