# Running RFC with optuna for hyperparameter tuning

In [1]:
import pandas as pd
import numpy as np
combined_df = pd.read_csv('../datasets/combined_df.csv')

combined_df.head()

Unnamed: 0,subreddit,post,automated_readability_index,coleman_liau_index,flesch_kincaid_grade_level,flesch_reading_ease,gulpease_index,gunning_fog_index,lix,smog_index,...,tfidf_wish,tfidf_without,tfidf_wonder,tfidf_work,tfidf_worri,tfidf_wors,tfidf_would,tfidf_wrong,tfidf_x200b,tfidf_year
0,adhd,Lethargic/Depressed when off meds First I'll g...,5.400816,6.50584,5.555245,81.416541,68.047619,9.145306,31.706803,9.3871,...,0.0,0.0,0.0,0.0,0.0,0.0,0.095341,0.0,0.0,0.086429
1,adhd,Concerta not working on the first day?! Update...,2.980698,5.751419,4.789892,76.862769,79.896552,8.314655,27.68319,9.017664,...,0.0,0.0,0.0,0.099106,0.0,0.0,0.290114,0.0,0.0,0.0
2,adhd,Comorbid anxiety and ADHD-PI Medication Questi...,5.136889,6.746474,6.981667,69.0525,69.148148,10.733333,34.240741,10.793553,...,0.0,0.0,0.0,0.0,0.117894,0.0,0.0,0.126925,0.0,0.0
3,adhd,Fist Day on Concerta 18mg UPDATE! Update!: Tha...,2.841137,5.750767,4.59774,76.904579,82.383459,8.13703,27.109492,8.841846,...,0.0,0.0,0.0,0.0,0.0,0.0,0.303759,0.0,0.0,0.0
4,adhd,I absolutely hate being so motivated but equal...,5.781923,7.779519,6.42359,72.163077,68.102564,9.302564,37.358974,9.725611,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
combined_df.drop(columns=['post'], inplace=True)

## Running optuna on RFC

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import optuna
import numpy as np


X = combined_df.drop(columns=['subreddit'])

# Target variable
y = combined_df['subreddit']

# Split the data into training and testing sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Further split the training set into training and validation sets (50% training, 50% validation)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.5, random_state=42)

def objective(trial):
    # Define the hyperparameters to optimize
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    max_depth = trial.suggest_int("max_depth", 5, 80)

    # Initialize the Random Forest Classifier with suggested hyperparameters
    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        criterion='entropy',
        class_weight='balanced_subsample',
        random_state=42
    )

    # Fit the classifier to the training data
    clf.fit(X_train, y_train)

    # Make predictions on the validation data
    y_val_pred = clf.predict(X_val)

    # Calculate accuracy
    accuracy = accuracy_score(y_val, y_val_pred)

    return accuracy

# Create an Optuna study for optimization
study = optuna.create_study(direction="maximize")

# Optimize the objective function
study.optimize(objective, n_trials=50) 

# Get the best hyperparameters from the study
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Initialize the Random Forest Classifier with the best hyperparameters
best_clf = RandomForestClassifier(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    class_weight='balanced_subsample',
    random_state=42
)


[32m[I 2023-10-24 19:48:50,781][0m A new study created in memory with name: no-name-1f871dc4-553b-46ba-aa20-c14486bfacf1[0m
[32m[I 2023-10-24 19:54:06,144][0m Trial 0 finished with value: 0.620316612626359 and parameters: {'n_estimators': 149, 'max_depth': 52}. Best is trial 0 with value: 0.620316612626359.[0m
[32m[I 2023-10-24 20:07:44,402][0m Trial 1 finished with value: 0.6231089071142476 and parameters: {'n_estimators': 390, 'max_depth': 80}. Best is trial 1 with value: 0.6231089071142476.[0m
[32m[I 2023-10-24 20:10:22,142][0m Trial 2 finished with value: 0.6154491703223346 and parameters: {'n_estimators': 76, 'max_depth': 68}. Best is trial 1 with value: 0.6231089071142476.[0m
[32m[I 2023-10-24 20:24:53,523][0m Trial 3 finished with value: 0.622513827961091 and parameters: {'n_estimators': 418, 'max_depth': 49}. Best is trial 1 with value: 0.6231089071142476.[0m
[32m[I 2023-10-24 20:32:19,695][0m Trial 4 finished with value: 0.6147930574098799 and parameters: {'n_

# Results of the best model

In [None]:

# Fit the best classifier to the entire training dataset
best_clf.fit(X_train, y_train)

# Make predictions on the test data
y_test_pred = best_clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {accuracy:.2f}')

# Generate a classification report on the test data
class_report = classification_report(y_test, y_test_pred, zero_division=0)
print("Test Classification Report:\n", class_report)


Test Accuracy: 0.64
Test Classification Report:
                precision    recall  f1-score   support

    addiction       0.75      0.56      0.64      2270
         adhd       0.69      0.78      0.74     13753
   alcoholism       0.68      0.73      0.70      1737
      anxiety       0.80      0.72      0.76     17178
       autism       0.61      0.29      0.39      2634
          bpd       0.91      0.49      0.64      7187
   depression       0.55      0.70      0.62     35446
       lonely       0.58      0.47      0.52      6961
         ptsd       0.81      0.55      0.65      2631
schizophrenia       0.66      0.14      0.23      2673
 suicidewatch       0.59      0.58      0.59     19880

     accuracy                           0.64    112350
    macro avg       0.69      0.55      0.59    112350
 weighted avg       0.65      0.64      0.63    112350



In [None]:
# print clf.feature_importances_ in descending order of importance 
feature_importances = pd.DataFrame(best_clf.feature_importances_, index = X_train.columns, columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances)

                             importance
tfidf_ptsd                     0.033360
tfidf_addict                   0.030753
tfidf_adhd                     0.030692
tfidf_bpd                      0.029676
tfidf_anxieti                  0.025313
tfidf_drink                    0.018946
tfidf_alcohol                  0.014629
liwc_ingestion                 0.013829
isolation_total                0.012952
substance_use_total            0.012034
sent_neu                       0.011648
sent_neg                       0.011510
coleman_liau_index             0.009752
tfidf_depress                  0.009702
suicidality_total              0.009169
sent_compound                  0.008465
wiener_sachtextformel          0.008017
flesch_reading_ease            0.007915
liwc_death                     0.007733
lix                            0.007461
automated_readability_index    0.007293
sent_pos                       0.007254
liwc_social_processes          0.006921
flesch_kincaid_grade_level     0.006882
