In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Path to the folder containing CSV files
folder_path = "data/"

# Initialize an empty list to store DataFrames
dfs = []

# Loop through the files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        file_path = os.path.join(folder_path, filename)
        # Read each CSV file into a DataFrame
        df = pd.read_csv(file_path)
        # Append the DataFrame to the list
        dfs.append(df)

# Concatenate all DataFrames in the list into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

In [3]:
combined_df.drop(columns=['author', 'date', 'post'], inplace=True)
combined_df = combined_df[combined_df['subreddit'].isin(['addiction', 'adhd', 'alcoholism', 'anxiety', 'autism', 'bpd', 'depression', 'lonely', 'ptsd', 'schizophrenia', 'suicidewatch'])] 

In [4]:
from sklearn.model_selection import train_test_split

X = combined_df.drop('subreddit', axis=1)  # Features
y = combined_df['subreddit']  # Target


In [5]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
# Step 2: Standard Scaling and Min-Max Scaling for numerical features
standard_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()
X_final = standard_scaler.fit_transform(X)

# pca = PCA(n_components=100)  # You can adjust the number of components as needed
# X_pca = pca.fit_transform(X_final)
# X_final = minmax_scaler.fit_transform(X)

In [6]:


X_train, X_test, y_train, y_test = train_test_split(X_final  , y, test_size=0.2, random_state=42)

In [7]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Encode the categorical target variable (y_train) using OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1))

# Step 2: Flatten the encoded labels
y_train_encoded = y_train_encoded.argmax(axis=1)
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1)).argmax(axis=1)


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import optuna
import numpy as np

def objective(trial):
    # Define the hyperparameters to optimize
    n_estimators = trial.suggest_int("n_estimators", 50, 200)
    # max_depth = trial.suggest_int("max_depth", 19, 40)
    # min_samples_split = trial.suggest_float("min_samples_split", 0.1, 1.0)
    # min_samples_leaf = trial.suggest_float("min_samples_leaf", 0.1, 0.5)

    # Initialize the Random Forest Classifier with suggested hyperparameters
    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=None,
        criterion='entropy',
        # min_samples_split=min_samples_split,
        # min_samples_leaf=min_samples_leaf,
        class_weight='balanced_subsample',
        random_state=42
    )

    # Fit the classifier to the training data
    clf.fit(X_train, y_train_encoded)

    # Make predictions on the validation data
    y_test_pred = clf.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test_encoded, y_test_pred)

    return accuracy

# Create an Optuna study for optimization
study = optuna.create_study(direction="maximize")

# Optimize the objective function
study.optimize(objective, n_trials=50)  # You can adjust the number of trials

# Get the best hyperparameters from the study
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Initialize the Random Forest Classifier with the best hyperparameters
best_clf = RandomForestClassifier(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    # min_samples_split=best_params["min_samples_split"],
    # min_samples_leaf=best_params["min_samples_leaf"],
    class_weight='balanced_subsample',
    random_state=42
)


[I 2023-10-30 09:45:02,423] A new study created in memory with name: no-name-304cf812-c4c1-4138-9210-3963e10614d0
[I 2023-10-30 09:55:12,393] Trial 0 finished with value: 0.6536448598130841 and parameters: {'n_estimators': 152}. Best is trial 0 with value: 0.6536448598130841.
[I 2023-10-30 10:04:02,760] Trial 1 finished with value: 0.6536181575433911 and parameters: {'n_estimators': 158}. Best is trial 0 with value: 0.6536448598130841.
[I 2023-10-30 10:08:20,622] Trial 2 finished with value: 0.6528704939919893 and parameters: {'n_estimators': 108}. Best is trial 0 with value: 0.6536448598130841.
[I 2023-10-30 10:11:30,963] Trial 3 finished with value: 0.6504939919893191 and parameters: {'n_estimators': 80}. Best is trial 0 with value: 0.6536448598130841.
[I 2023-10-30 10:17:23,451] Trial 4 finished with value: 0.6532843791722296 and parameters: {'n_estimators': 149}. Best is trial 0 with value: 0.6536448598130841.
[I 2023-10-30 10:20:44,628] Trial 5 finished with value: 0.6510146862483

In [None]:

# Fit the best classifier to the entire training dataset
best_clf.fit(X_train, y_train_encoded)

# Make predictions on the test data
y_test_pred = best_clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test_encoded, y_test_pred)
print(f'Test Accuracy: {accuracy:.2f}')

# Generate a classification report on the test data
class_report = classification_report(y_test_encoded, y_test_pred, zero_division=0)
print("Test Classification Report:\n", class_report)


Test Accuracy: 0.55
Test Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.67      0.60      1577
           1       0.84      0.64      0.73      9182
           2       0.48      0.83      0.61      1254
           3       0.75      0.72      0.74     11419
           4       0.17      0.65      0.27      1754
           5       0.79      0.49      0.60      4899
           6       0.75      0.34      0.47     23472
           7       0.31      0.71      0.43      4724
           8       0.60      0.61      0.61      1766
           9       0.20      0.33      0.25      1788
          10       0.52      0.67      0.58     13065

    accuracy                           0.55     74900
   macro avg       0.54      0.61      0.54     74900
weighted avg       0.66      0.55      0.57     74900



In [None]:


# # Initializing the Random Forest classifier
# rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced_subsample', verbose=2)

# # Fitting the classifier to the training data
# rf_classifier.fit(X_train, y_train_encoded)

# # Making predictions on the test data
# predictions = rf_classifier.predict(X_test)

# # Calculating accuracy
# accuracy = accuracy_score(y_test_encoded, predictions)
# print("Accuracy:", accuracy)


In [None]:

# # Generating a classification report
# print("Classification Report:\n", classification_report(y_test_encoded, predictions))
