# In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

# Load dataset
url = 'https://drive.google.com/uc?id=1bGoIE4Z2kG5nyh-fGZAJ7LH0ki3UfmSJ'
data = pd.read_csv(url)

# Q1. Preprocess the dataset
print("Q1. Preprocess the dataset")

# Display first few rows
print(data.head())

# The label column is kept out of mean-imputation and scaling: either step
# would turn class labels into continuous values, which a classifier rejects.
target_column = 'target'  # Replace 'target' with the actual target column name

# Handling missing values
# A row without a label cannot be used for supervised training, so drop it
# instead of inventing a label for it.
has_target = target_column in data.columns
labelled_rows = data.dropna(subset=[target_column]) if has_target else data
data_imputed = labelled_rows.copy()

feature_columns = data_imputed.columns.drop(target_column, errors='ignore')
numeric_columns = data_imputed[feature_columns].select_dtypes(include='number').columns
categorical_columns = data_imputed[feature_columns].select_dtypes(include=['object']).columns

# The mean only exists for numeric columns; fitting a mean imputer on string
# columns raises, so categorical features get the most frequent value instead.
imputer = SimpleImputer(strategy='mean')
if len(numeric_columns) > 0:
    data_imputed[numeric_columns] = imputer.fit_transform(data_imputed[numeric_columns])

categorical_imputer = SimpleImputer(strategy='most_frequent')
if len(categorical_columns) > 0:
    data_imputed[categorical_columns] = categorical_imputer.fit_transform(
        data_imputed[categorical_columns]
    )

# Encoding categorical variables
# Runs after imputation so the encoders never see missing values. A string
# target is encoded here too; the fitted encoders are kept so the integer
# codes can be mapped back to the original categories.
label_encoders = {}
for column in data_imputed.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data_imputed[column] = le.fit_transform(data_imputed[column])
    label_encoders[column] = le

# Scaling numerical features
# Every numeric column except the target is standardized (this includes the
# freshly label-encoded feature columns).
scaler = StandardScaler()
numerical_features = (
    data_imputed.select_dtypes(include=['float64', 'int64'])
    .columns
    .drop(target_column, errors='ignore')
)
if len(numerical_features) > 0:
    data_imputed[numerical_features] = scaler.fit_transform(data_imputed[numerical_features])

# Display processed data
print(data_imputed.head())

# Q2. Split the dataset into training and test set
print("\nQ2. Split the dataset into training and test set")

# Replace 'target' with the actual target column name in both lines below.
y = data_imputed['target']
X = data_imputed.drop(columns='target')

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Q3. Train a Random Forest Classifier
print("\nQ3. Train a Random Forest Classifier")

# 100 trees capped at depth 10, seeded for repeatable results. fit() returns
# the fitted estimator, so construction and training can be chained.
rf = RandomForestClassifier(
    random_state=42,
    max_depth=10,
    n_estimators=100,
).fit(X_train, y_train)

# Q4. Evaluate the performance of the model
print("\nQ4. Evaluate the performance of the model")

y_pred = rf.predict(X_test)

# precision/recall/F1 default to average='binary', which raises ValueError for
# targets with more than two classes. Keep the binary behaviour for two-class
# problems and fall back to a support-weighted average otherwise.
metric_average = 'binary' if len(np.unique(y)) <= 2 else 'weighted'

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average=metric_average)
recall = recall_score(y_test, y_pred, average=metric_average)
f1 = f1_score(y_test, y_pred, average=metric_average)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Q5. Feature importance analysis
print("\nQ5. Feature Importance Analysis")

# Rank feature indices from most to least important.
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Never ask for more features than the model has, and build the plot title
# from the same number so the two cannot drift apart.
top_n = min(5, len(indices))
top_features = indices[:top_n]
feature_names = X.columns

plt.figure(figsize=(10, 6))
plt.title(f"Top {top_n} Important Features")
sns.barplot(x=importances[top_features], y=feature_names[top_features], palette="viridis")
plt.xlabel('Feature Importance')
plt.show()

# Q6. Hyperparameter tuning with GridSearchCV
print("\nQ6. Hyperparameter Tuning with GridSearchCV")

# 3 x 3 x 3 x 3 = 81 parameter combinations, each evaluated with 5-fold CV.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

print(f"Best parameters found: {grid_search.best_params_}")
best_rf = grid_search.best_estimator_

# Evaluate the tuned model
y_pred_tuned = best_rf.predict(X_test)

# precision/recall/F1 default to average='binary', which raises ValueError for
# targets with more than two classes. Keep the binary behaviour for two-class
# problems and fall back to a support-weighted average otherwise.
tuned_metric_average = 'binary' if len(np.unique(y)) <= 2 else 'weighted'

accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
precision_tuned = precision_score(y_test, y_pred_tuned, average=tuned_metric_average)
recall_tuned = recall_score(y_test, y_pred_tuned, average=tuned_metric_average)
f1_tuned = f1_score(y_test, y_pred_tuned, average=tuned_metric_average)

print(f"\nTuned Model Accuracy: {accuracy_tuned:.2f}")
print(f"Tuned Model Precision: {precision_tuned:.2f}")
print(f"Tuned Model Recall: {recall_tuned:.2f}")
print(f"Tuned Model F1 Score: {f1_tuned:.2f}")

# Q7. Compare performance of tuned model with default model
# One print call emits the heading and both metric summaries, one per line.
print(
    "\nQ7. Comparing performance of tuned model with default model",
    f"Default Model - Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}",
    f"Tuned Model - Accuracy: {accuracy_tuned:.2f}, Precision: {precision_tuned:.2f}, Recall: {recall_tuned:.2f}, F1 Score: {f1_tuned:.2f}",
    sep="\n",
)

# Q8. Interpret the model by analyzing decision boundaries
print("\nQ8. Interpret the model by analyzing decision boundaries")

# Reduce to 2D for visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.3, random_state=42)

# Train a new Random Forest on 2D data
rf_2d = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_2d.fit(X_train_pca, y_train_pca)

# Plot decision boundaries
# Sample a fixed number of points per axis rather than stepping by a fixed
# 0.01: with a fixed step the grid (and the memory and time needed to predict
# on it) grows with the spread of the projected data.
grid_resolution = 300
xx, yy = np.meshgrid(
    np.linspace(X_pca[:, 0].min() - 1, X_pca[:, 0].max() + 1, grid_resolution),
    np.linspace(X_pca[:, 1].min() - 1, X_pca[:, 1].max() + 1, grid_resolution),
)
# Predict a class for every grid point, then fold the flat result back into
# the grid shape that contourf expects.
Z = rf_2d.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure(figsize=(10, 6))
plt.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.RdYlBu)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, edgecolor='k', cmap=plt.cm.RdYlBu)
plt.title("Decision Boundaries of Random Forest Classifier")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.show()
