In [None]:
# Prints the DataFrame summary produced by `DataFrame.info()` for `df`.
# NOTE(review): within the visible notebook, `df` is first assigned by the
# read_csv call in the next cell, and this cell has no execution count, so on
# a fresh kernel this presumably raises NameError unless an earlier cell
# defines `df` - confirm, and consider moving it below the load step.
df.info()

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the dataset. The raw-string prefix keeps the Windows backslashes literal
# instead of relying on the unrecognised escape sequences `\D` and `\M`, which
# newer Python versions warn about; the resulting path is unchanged.
df = pd.read_csv(r'E:\Downloads\DV FINAL PROJECT\Mental_Health_Care.csv')

# Feature matrix and target. `.copy()` gives X its own data, so the column
# assignment below modifies X only and does not raise SettingWithCopyWarning.
X = df[['Year', 'Group', 'Value']].copy()
y = df['Indicator']

# Replace 'Group' with integer codes. Plain column assignment swaps in the new
# column together with its integer dtype, whereas `X.loc[:, 'Group'] = ...`
# tries to write the codes into the existing column in place on pandas 2.x.
# NOTE(review): LabelEncoder yields arbitrary ordinal codes; assumes 'Group'
# is a categorical/text column - confirm against the data.
encoder = LabelEncoder()
X['Group'] = encoder.fit_transform(X['Group'])


# Hold out 20% of the rows for testing. Both parts are copied so the scaling
# assignments below write to independent frames.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train.copy()
X_test = X_test.copy()

# Scale numerical features: the scaler is fitted on the training rows only and
# then applied unchanged to the test rows.
scaler = StandardScaler()
X_train[['Year', 'Value']] = scaler.fit_transform(X_train[['Year', 'Value']])
X_test[['Year', 'Value']] = scaler.transform(X_test[['Year', 'Value']])

# Base random forest classifier; the fixed seed makes every fit reproducible.
rf_classifier = RandomForestClassifier(random_state=42)

# Hyperparameter grid: 2 x 2 x 2 x 2 = 16 candidate settings.
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# 5-fold cross-validated grid search scored by accuracy. With the default
# refit=True, GridSearchCV retrains the winning configuration on all of
# (X_train, y_train) once the search has finished.
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best hyperparameter combination found by the search.
best_params = grid_search.best_params_

# Reuse the estimator GridSearchCV already refitted instead of constructing and
# fitting a second forest with the same parameters, seed and training data.
# Note: this is the same object as grid_search.best_estimator_, so calling
# .fit() on it later also changes what grid_search predicts with.
best_rf_classifier = grid_search.best_estimator_

def _print_evaluation(title, y_true, y_pred):
    """Print accuracy, classification report and confusion matrix for one split.

    Parameters
    ----------
    title : str
        Heading printed verbatim before the metrics.
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model for the same rows.

    Returns
    -------
    float
        The accuracy computed by ``accuracy_score(y_true, y_pred)``.
    """
    score = accuracy_score(y_true, y_pred)
    print(title)
    print("Accuracy:", score)
    print("Classification Report:")
    print(classification_report(y_true, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    return score


# Evaluate the model on the held-out test set.
y_pred = best_rf_classifier.predict(X_test)
accuracy = _print_evaluation("Test Set Evaluation:", y_test, y_pred)

# Evaluate the model on the training set. The model was fitted on these rows,
# so this is an in-sample score; compare it with the test score above to see
# how much the model overfits.
train_pred = best_rf_classifier.predict(X_train)
train_accuracy = _print_evaluation("\nTraining Set Evaluation:", y_train, train_pred)

# Stack the (already scaled) train and test rows back into one dataset.
combined_X = pd.concat([X_train, X_test])
combined_y = pd.concat([y_train, y_test])

# Train the model on the combined dataset. This refits best_rf_classifier in
# place, so from here on it has seen the test rows as well.
best_rf_classifier.fit(combined_X, combined_y)

# Evaluate the model on the combined dataset. The model is scored on exactly
# the rows it was just fitted on, so these numbers describe fit to the data
# and are not an estimate of performance on unseen data.
combined_pred = best_rf_classifier.predict(combined_X)
combined_accuracy = _print_evaluation("\nCombined Set Evaluation:", combined_y, combined_pred)


Test Set Evaluation:
Accuracy: 0.584814992791927
Classification Report:
                                                                                      precision    recall  f1-score   support

                                     Needed Counseling or Therapy But Did Not Get It       0.50      0.51      0.51       505
                                                      Received Counseling or Therapy       0.53      0.50      0.51       506
                                      Took Prescription Medication for Mental Health       0.64      0.64      0.64       551
Took Prescription Medication for Mental Health And/Or Received Counseling or Therapy       0.67      0.68      0.67       519

                                                                            accuracy                           0.58      2081
                                                                           macro avg       0.58      0.58      0.58      2081
                                            