In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from scipy.stats import randint as sp_randint
import joblib

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

________
# Data Initializing 
________

In [4]:
# Location of the dataset CSV that the loading cell reads.
DATA = "covertype.csv"

In [5]:
# Read the whole CSV into a DataFrame; every later cell derives from `df`.
df = pd.read_csv(DATA)

In [6]:
# Keep a handle on the full column index of the loaded frame.
all_columns = df.columns

In [7]:
# Preview the first rows as a quick sanity check of the load.
df.head()

In [8]:
# Print summary statistics (as produced by DataFrame.describe) under a header.
print("\nDescriptive statistics for numerical features:")
summary_stats = df.describe()
print(summary_stats)

In [9]:
# The target is taken to be the last column of the frame.
target_column_index = len(df.columns) - 1
print("\nClass distribution of the target variable:")
# Number of rows per distinct target value, most frequent first.
target_counts = df.iloc[:, target_column_index].value_counts()
print(target_counts)

In [10]:
# Count missing entries per column (`isna` is the same check as `isnull`).
missing_values = df.isna().sum()
missing_values


________
# Scaling the Data 
________

In [11]:
# Inputs are every column except the last; the target is the last column.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [12]:
# Text dump of the features followed by the target, separated by a newline.
print(X, y, sep=' \n ')

In [13]:
# The first ten feature columns are the ones routed through the scaler;
# every remaining column is passed through unchanged by the ColumnTransformer.
n_scaled_columns = 10
numerical_features = X.columns[:n_scaled_columns]

In [14]:
# Apply RobustScaler to the numerical columns only; all other columns are
# forwarded untouched (remainder='passthrough').
robust_step = ('robust', RobustScaler(), numerical_features)
preprocessor = ColumnTransformer(
    transformers=[robust_step],
    remainder='passthrough',
)

In [15]:
# Fit the preprocessor and transform the features in a single pass.
# NOTE(review): this fits on the full dataset, i.e. before the train/test
# split, so the scaler's statistics include rows that later land in the test set.
X_scaled = preprocessor.fit_transform(X)

In [15]:
# Inspect the transformed feature matrix.
X_scaled

In [16]:
# Confirm the transform ran and report the resulting matrix shape.
print(f"Data Scaled successfully via Robust scaler.\nShape of the scaled data: {X_scaled.shape}")

____
# Train Test Split
____

In [17]:
# Shift the target down by one so that class labels start at 0.
# It is derived from `df` rather than written as `y = y - 1`, so re-running
# this cell cannot shift the labels a second time.
y = df.iloc[:, -1] - 1

# Split the *unscaled* features first (same rows as before: identical seed,
# test size and stratification), then fit the scaler on the training rows only
# so that no test-set statistics leak into preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Fit on train, apply the already-fitted transform to test.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

____
# Training Model XGBoost
___

In [18]:
# Baseline multi-class XGBoost classifier.
# - `use_label_encoder` is no longer passed: the option is deprecated in recent
#   XGBoost releases and only triggers a warning; the labels fed to this model
#   are already shifted to start at 0.
# - `random_state` is pinned so that runs which use row/column subsampling
#   (e.g. the hyper-parameter search that clones this estimator) are repeatable,
#   consistent with the seeded split and the seeded random forest.
model = XGBClassifier(
    objective='multi:softmax',
    n_estimators=600,
    eval_metric='mlogloss',
    random_state=42
)

In [33]:
# Train the baseline XGBoost model on the training split.
model.fit(X_train, y_train)

print("Model training Complete!!")

____
# Accuracy
___ 

In [34]:
# Predicted class for every row of the held-out test split.
y_pred = model.predict(X_test)

In [35]:
# Score the model on the test split (the estimator's own `score` method).
score = model.score(X_test, y_test)

# Only two decimals are shown, so differences below 0.01 are not visible here.
print(f"The Score of our model is: {score:.2f}")

In [51]:
# Per-class precision / recall / F1 for the XGBoost predictions.
print('XGBoost Classification Reports:')
xgb_report = classification_report(y_test, y_pred)
print(xgb_report)

____
# Plotting Results
___

In [36]:
# Confusion matrix of true test labels against the XGBoost predictions.
conf_matrix = confusion_matrix(y_test, y_pred)

In [37]:
# Annotated heatmap of the confusion matrix, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel('Predicted Class')
ax.set_ylabel('Actual Class')
plt.show()

__________________

# Visualizing Feature Importance

In [38]:
# Plain Python list of the feature column names.
feature_names = list(X.columns)

In [39]:
# Importance scores exposed by the trained XGBoost model.
# NOTE(review): these are later paired with `X.columns` by position, which
# assumes the preprocessor keeps the original column order — confirm.
feature_importances = model.feature_importances_


In [40]:
# Inspect the raw importance values.
feature_importances

In [42]:
# Pair each feature name with its importance and rank from most to least important.
importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
    .sort_values(by='importance', ascending=False)
)

In [43]:
# Show the ranked importance table.
importance_df

In [44]:
# Horizontal bar chart of the 15 highest-ranked features.
top_features = importance_df.head(15)
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=top_features, x='importance', y='feature', ax=ax)
ax.set_title('Top 15 Feature Importances')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Feature')
plt.show()

____
# Random Forest Classifier

In [19]:
# Random forest baseline: 500 trees, seeded for repeatable results.
rfc_model = RandomForestClassifier(n_estimators=500, random_state=42)

In [47]:
# Train the random forest on the same training split as the XGBoost model.
rfc_model.fit(X_train, y_train)
print("RFC Model Trained!")

In [49]:
# Random forest predictions for the held-out test split.
rfc_y_pred = rfc_model.predict(X_test)

In [60]:
# Score the random forest on the test split (the estimator's own `score` method).
rfc_score = rfc_model.score(X_test, y_test)

# Only two decimals are shown, so differences below 0.01 are not visible here.
print(f"Thus RFC Score is: {rfc_score:.2f}")

___
## Classification reports for Both Models
___

RFC MODEL:

In [50]:
# Per-class precision / recall / F1 for the random forest predictions.
print('RFC Classification Reports:')
rfc_report = classification_report(y_test, rfc_y_pred)
print(rfc_report)

XGBoost Model:

In [61]:
# XGBoost report repeated here so both models can be read side by side.
print('XGBoost Classification Reports:')
xgb_report_text = classification_report(y_test, y_pred)
print(xgb_report_text)

___
## Comparing the Scores of both models with each other
___

In [54]:
# One confusion matrix per model, both measured against the same test labels.
rfc_conf_matrix, xgb_conf_matrix = (
    confusion_matrix(y_test, predicted) for predicted in (rfc_y_pred, y_pred)
)


___
XGBoost Model Confusion Matrix

In [58]:
# Heatmap of the XGBoost confusion matrix (integer counts annotated per cell).
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(xgb_conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel('Predicted Class')
ax.set_ylabel('Actual Class')
plt.show()

___
RFC Model Confusion Matrix

In [59]:
# Heatmap of the random forest confusion matrix (integer counts annotated per cell).
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(rfc_conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel('Predicted Class')
ax.set_ylabel('Actual Class')
plt.show()

___
# Conclusion

- RFC MODEL:
    - Theoretically better for small data sets.
    - Accuracy predicted is around 0.96
    - Model Training Time: 12 Minutes

- XGBoost Model:
    - Theoretically better for large data sets like the one used in this task
    - Accuracy predicted is around 0.95
    - Model Training Time: 18 Seconds


### In a nutshell, we will go with the XGBoost model: it takes significantly less time to train, and chasing an improvement of 1% is not worth the time and resources the RFC model requires.
___

___
# HyperTuning XGBoost
___

In [20]:
# Search space for the XGBoost hyper-parameter search: the integer-valued
# settings use scipy `randint` distributions, the others are explicit
# candidate lists.
param_distribution = dict(
    n_estimators=sp_randint(500, 700),
    learning_rate=[0.05, 0.1, 0.2],
    max_depth=sp_randint(3, 10),
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)

In [21]:
# Randomized hyper-parameter search over the XGBoost model defined earlier
# (the search clones the estimator, so the already-fitted `model` is not reused).
# 30 sampled configurations x 3 folds, ranked by macro-averaged F1.
# `random_state` pins which configurations are sampled, so the reported best
# parameters and score can be reproduced on a re-run.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_distribution,
    n_iter=30,
    cv=3,
    scoring='f1_macro',
    verbose=2,
    n_jobs=-1,
    random_state=42
)

In [22]:
# Run the search on the training split only; the test split stays untouched.
random_search.fit(X_train, y_train)

In [23]:
# Report the winning configuration and its cross-validated score
# (the search was scored with 'f1_macro').
print(f"Best parameters found: {random_search.best_params_}")
print(f"Best Cross Validation F-1 Score: {random_search.best_score_}")

In [24]:
# Estimator built with the best parameters found by the search.
best_model = random_search.best_estimator_

In [30]:
# Evaluate the tuned model on the held-out test split.
# NOTE(review): this uses the estimator's `score` method, whereas the search
# itself was ranked by 'f1_macro' — the two numbers are not the same metric.
print(f"Score for hyper tuned XGBoost Model is: {best_model.score(X_test, y_test):.2f}")


___
# HyperTuning Random Forest Classifier
___

In [28]:
# Search space for the random forest: integer settings use scipy `randint`
# distributions, `max_features` is chosen from an explicit candidate list.
rfc_param_distributions = {
    'n_estimators': sp_randint(200, 500),
    'max_depth': sp_randint(3, 15),
    'min_samples_leaf': sp_randint(1, 10),
    'max_features': ['sqrt', 'log2', None]
}

# 30 sampled configurations x 3 folds, ranked by macro-averaged F1.
# `random_state` pins which configurations are sampled so that a (very
# expensive) run of this search can be reproduced.
random_search_rfc = RandomizedSearchCV(
    estimator=rfc_model,
    param_distributions=rfc_param_distributions,
    n_iter=30,
    cv=3,
    scoring='f1_macro',
    verbose=2,
    n_jobs=-1,
    random_state=42
)

In [29]:
# Run the random forest search on the training split.
# NOTE(review): the notebook's own conclusion estimates roughly 19.5 hours for
# this cell on a laptop — expect a full re-run to be impractical.
random_search_rfc.fit(X_train,y_train)

___
# Conclusion

- RFC MODEL:
    - Not feasible to tune on a laptop: finding the best model takes approximately 19.5 hours for the 90 fits (30 candidates × 3 folds).

- XGBoost Model:
    - The best model reaches a score of about 0.97 on the test split, with the following hyperparameters:
    - 'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 9, 'n_estimators': 648, 'subsample': 0.6


### In a nutshell, we will go with the XGBoost model: it takes significantly less time to train, and chasing an improvement of 1% is not worth the time and resources the RFC model requires.
___

In [37]:
# Bundle the fitted preprocessor and the tuned XGBoost model into a single
# object so raw feature rows can be passed straight to `predict`.
# Both steps are already fitted; the pipeline itself is not re-fitted here.
# (The variable name `pipline` is kept because later cells refer to it.)
inference_steps = [
    ("preprocessor", preprocessor),
    ("model", best_model),
]
pipline = Pipeline(steps=inference_steps)

In [40]:
# Show a single raw row as a template for building a new input.
df.head(1)

In [41]:
# List the column names of the loaded frame.
df.columns

In [43]:
# Column names for the hand-built sample row. They are intended to mirror the
# feature columns the pipeline was fitted on, so the order matters.
continuous_names = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]
# Wilderness_Area1..4 followed by Soil_Type1..40, generated instead of typed out.
wilderness_names = [f'Wilderness_Area{k}' for k in range(1, 5)]
soil_names = [f'Soil_Type{k}' for k in range(1, 41)]
columns = continuous_names + wilderness_names + soil_names

# One hypothetical observation: a single row holding one value per entry of
# `columns`, in the same order (the leading values are the continuous
# measurements, the trailing 0/1 values are the indicator columns).
new_values = [
    [2900, 150, 8, 100, 20, 1500, 240, 230, 140, 4500, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
]

# Wrap the row in a DataFrame so it carries the feature names.
new_entry_df = pd.DataFrame(new_values, columns=columns)

# `new_entry_df` contains feature columns only (no target column), so it can
# be handed to the fitted pipeline's `predict` as-is — nothing needs dropping.



In [44]:
# Run the sample row through preprocessing and the tuned model.
# NOTE(review): the model was trained on the target shifted down by one, so
# the values in `prediction` are in that shifted numbering.
prediction = pipline.predict(new_entry_df)

In [46]:
# The model was trained on the target shifted down by one (`y - 1`), so add
# one back to report the class in the dataset's original label numbering.
int(prediction[0]) + 1

In [48]:
# Persist the fitted preprocessing + model pipeline to disk.
# NOTE(review): the stored model predicts the shifted labels (target minus 1);
# consumers must add 1 to recover the original class ids.
# joblib files are pickle-based — only load them from trusted sources.
joblib.dump(pipline, "Forest_Cover_Type_predictor.joblib")

In [50]:
# Final look at the (unscaled) feature frame.
X

In [51]:
# Final look at the target as used for training (shifted down by one).
y