In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the preprocessed dataset from the Excel workbook.
df_preprocessed = pd.read_excel('preprocessed_data.xlsx')

# 'District' is the label to predict; every remaining column is used as a feature.
y = df_preprocessed['District']
# One-hot encode the categorical feature columns, dropping the first level of each.
X = pd.get_dummies(
    df_preprocessed.drop(columns='District'),
    drop_first=True,
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Hyper-parameter grid explored by the search (3 x 3 x 3 x 3 = 81 combinations).
param_grid = {
    'n_estimators': [50, 100, 150],   # Number of trees in the forest
    'max_depth': [5, 10, 15],         # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],    # Minimum number of samples required at each leaf node
}

# Base estimator; the fixed seed makes every candidate forest reproducible.
rf_classifier = RandomForestClassifier(random_state=42)

# 5-fold cross-validated search scored on accuracy: 81 candidates x 5 folds = 405 fits.
# n_jobs=-1 runs those fits in parallel on all available cores; it only affects
# wall-clock time, not which parameters are selected.
# refit=True (the default, spelled out here) retrains the winning configuration on
# all of X_train and exposes it as grid_search.best_estimator_.
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    refit=True,
)

# Run the search on the training split only, so the test split stays unseen.
grid_search.fit(X_train, y_train)

# Report the best-scoring parameter combination.
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# GridSearchCV refits by default (refit=True): after the search it has already trained
# a clone of the base estimator (same random_state=42) with the best parameters on the
# full training split. Reuse that fitted model instead of training an identical forest
# a second time.
best_rf_classifier = grid_search.best_estimator_

# Make predictions on the held-out test set
y_pred_test = best_rf_classifier.predict(X_test)

# Calculate accuracy on the test set.
# NOTE(review): accuracy is an aggregate over all districts; it does not show how
# individual (especially rare) districts are predicted -- inspect per-class metrics too.
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f'Testing Accuracy with GridSearchCV: {test_accuracy * 100:.2f}%')




Best Parameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
Testing Accuracy with GridSearchCV: 70.83%


In [3]:
from sklearn.metrics import classification_report

# Per-district precision / recall / F1 on the test split.
# Classes that are never predicted have an undefined precision (0 / 0). Passing
# zero_division=0 reports those scores as 0.0 explicitly instead of emitting an
# UndefinedMetricWarning for each affected metric.
print(classification_report(y_test, y_pred_test, zero_division=0))

              precision    recall  f1-score   support

anuradhapura       0.00      0.00      0.00         1
     colombo       0.63      1.00      0.77        47
       galle       1.00      0.62      0.77         8
     gampaha       1.00      0.57      0.73        14
      jaffna       1.00      0.67      0.80         3
    kalutara       1.00      0.50      0.67         2
       kandy       1.00      0.57      0.73         7
     kegalle       0.00      0.00      0.00         7
      matale       1.00      0.33      0.50         3
      matara       0.00      0.00      0.00         2
  mullaitivu       0.00      0.00      0.00         1
   ratnapura       0.00      0.00      0.00         1

    accuracy                           0.71        96
   macro avg       0.55      0.36      0.41        96
weighted avg       0.69      0.71      0.65        96



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
