# In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Load the dataset
dataset = pd.read_csv('NPHA-doctor-visits.csv')

# Feature columns used by the model (Dental Health, Mental Health, Physical Health, Race, Employment)
feature_columns = ['Dental Health', 'Mental Health', 'Physical Health', 'Race', 'Employment']

# Take an explicit copy so the cleaning steps below never write into a slice of `dataset`
# (avoids pandas' SettingWithCopyWarning and any ambiguity about what gets modified)
X = dataset[feature_columns].copy()

# Target variable: Number of Doctors Visited
y = dataset['Number of Doctors Visited']

# Handle missing or unwanted values (-1 for "Refused" or -2 for "Not asked"):
# mask them out as NaN so that fillna() below can impute them
X = X.mask(X.isin([-1, -2]))

# Fill missing values with the most frequent value in each column
# NOTE: the modes are computed on the full dataset (before the split below)
X = X.apply(lambda col: col.fillna(col.mode()[0]))

# Convert categorical variables to numerical codes using LabelEncoder
label_encoders = {}
for column in feature_columns:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])
    label_encoders[column] = le  # Store the encoder to use it later if needed

# Split FIRST (80% training, 20% testing), stratified on the target so both parts keep
# the original class proportions. The test set must contain only real, untouched samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Apply SMOTE to the training portion only. Oversampling before the split would let
# synthetic points interpolated from test rows leak into training and inflate test scores.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)
X_train, y_train = X_res, y_res

# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Use StratifiedKFold for cross-validation with GridSearchCV
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# GridSearchCV with StratifiedKFold; n_jobs=-1 runs the candidate fits in parallel.
# NOTE: the folds are drawn from the already-resampled training data, so the CV scores are
# only used to rank hyperparameters; the held-out test set is the unbiased estimate.
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    cv=stratified_kfold,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# GridSearchCV refits the best configuration on the whole training set by default
# (refit=True), so best_estimator_ is already trained and needs no second fit() call.
best_rf_classifier = grid_search.best_estimator_

# Make predictions for train and test sets
y_pred_train = best_rf_classifier.predict(X_train)
y_pred_test = best_rf_classifier.predict(X_test)

# Calculate Accuracy for train and test sets
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

# Confusion Matrix for train and test sets
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)

# Calculate True Positive Rate (TPR) and False Positive Rate (FPR) for each class
def calculate_tpr_fpr(cm):
    """Compute one-vs-rest TPR and FPR for every class of a confusion matrix.

    Args:
        cm: Square 2-D array where ``cm[r, c]`` counts samples of true class
            ``r`` predicted as class ``c`` (rows = true, columns = predicted).

    Returns:
        Dict mapping each class index to ``{'TPR': ..., 'FPR': ...}``. A rate
        whose denominator is zero is reported as ``0``.
    """
    # Total sample count is the same for every class, so compute it once.
    grand_total = sum(sum(cm))

    rates = {}
    for idx, true_row in enumerate(cm):
        hits = cm[idx][idx]                      # diagonal: correctly predicted
        misses = sum(true_row) - hits            # rest of the row: false negatives
        false_alarms = sum(cm[:, idx]) - hits    # rest of the column: false positives
        correct_rejections = grand_total - (hits + misses + false_alarms)

        if (hits + misses) > 0:
            recall = hits / (hits + misses)
        else:
            recall = 0

        if (false_alarms + correct_rejections) > 0:
            fall_out = false_alarms / (false_alarms + correct_rejections)
        else:
            fall_out = 0

        rates[idx] = {'TPR': recall, 'FPR': fall_out}
    return rates

# Per-class TPR/FPR derived from the train and test confusion matrices
tpr_fpr_train = calculate_tpr_fpr(cm_train)
tpr_fpr_test = calculate_tpr_fpr(cm_test)

# Display names for the three target classes, shared by the reports and the heatmap
class_names = ['1 (0-1 doctors)', '2 (2-3 doctors)', '3 (4 or more doctors)']

# Classification reports for the test and train sets
class_report_test = classification_report(y_test, y_pred_test, target_names=class_names)
class_report_train = classification_report(y_train, y_pred_train, target_names=class_names)

# Summary: best hyperparameters, accuracies, confusion matrices, classification reports
print(f"Best Parameters from GridSearchCV: {grid_search.best_params_}")
print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")
print(f"\nConfusion Matrix for Train Set:\n{cm_train}")
print(f"\nConfusion Matrix for Test Set:\n{cm_test}")
print(f"\nClassification Report for Test Set:\n{class_report_test}")
print(f"\nClassification Report for Train Set:\n{class_report_train}")

# TPR and FPR per class, train set first and then test set
for heading, rates_by_class in (
    ("\nTrue Positive Rate (TPR) and False Positive Rate (FPR) for Train Set:", tpr_fpr_train),
    ("\nTrue Positive Rate (TPR) and False Positive Rate (FPR) for Test Set:", tpr_fpr_test),
):
    print(heading)
    for class_index, rates in rates_by_class.items():
        # Dictionary keys are 0-based matrix indices; printed class numbers start at 1
        print(f"Class {class_index + 1}: TPR = {rates['TPR']:.2f}, FPR = {rates['FPR']:.2f}")

# Optional: visualize the confusion matrix for the test set as an annotated heatmap
sns.heatmap(
    cm_test,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix - Test Set')
plt.show()
