In [1]:
import pandas as pd
import numpy as np
import os

In [3]:
# Path to the folder containing CSV files
folder_path = "data/"

# Initialize an empty list to store DataFrames
dfs = []

# Loop through the files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        file_path = os.path.join(folder_path, filename)
        # Read each CSV file into a DataFrame
        df = pd.read_csv(file_path)
        # Append the DataFrame to the list
        dfs.append(df)

# Concatenate all DataFrames in the list into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

In [None]:
combined_df.drop(columns=['author', 'date', 'post'], inplace=True)

In [None]:
# from the combined_df, create a new dataframe in which the 'subreddit' column has only the values 'addiction', 'adhd', 'alcoholism', 'anxiety', 'autism', 'bpd', 'depression', 'lonely', 'ptsd', 'schizophrenia', 'suicidewatch'. Name this new dataframe as 'new_df'
combined_df = combined_df[combined_df['subreddit'].isin(['addiction', 'adhd', 'alcoholism', 'anxiety', 'autism', 'bpd', 'depression', 'lonely', 'ptsd', 'schizophrenia', 'suicidewatch'])] 

In [2]:


import pandas as pd
from sklearn.preprocessing import OneHotEncoder

combined_df = pd.read_csv('./combined_df.csv')

# Encode the 'subreddit' column using label encoding
label_encoder = OneHotEncoder()
combined_df['subreddit_encoded'] = label_encoder.fit_transform(combined_df['subreddit'])

# Calculate the correlation of each column with 'subreddit_encoded'
correlation_with_subreddit = {}
for column in combined_df.columns:
    if column != 'subreddit':
        correlation = combined_df['subreddit_encoded'].corr(combined_df[column])
        correlation_with_subreddit[column] = correlation

ValueError: Expected 2D array, got 1D array instead:
array=['adhd' 'adhd' 'adhd' ... 'addiction' 'addiction' 'addiction'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
# Convert the correlation dictionary into a DataFrame
correlation_df = pd.DataFrame(list(correlation_with_subreddit.items()), columns=['Column', 'Correlation'])

# Sort the DataFrame by correlation values in descending order
correlation_df = correlation_df.sort_values(by='Correlation', ascending=False)

# print the correlation_df without truncation of rows
pd.set_option('display.max_rows', None)
print(correlation_df)


                          Column  Correlation
346            subreddit_encoded     1.000000
17                      sent_neg     0.257640
301                 tfidf_suicid     0.257270
206                   tfidf_kill     0.243008
145                    tfidf_die     0.204068
328                   tfidf_want     0.183635
109                 tfidf_anymor     0.173392
46                    liwc_death     0.172362
214                   tfidf_life     0.171300
217                   tfidf_live     0.168835
79                  liwc_sadness     0.147013
157                    tfidf_end     0.132902
26             suicidality_total     0.131850
177                   tfidf_fuck     0.128028
81                   liwc_sexual     0.127229
176                 tfidf_friend     0.127179
131                   tfidf_care     0.121360
166                 tfidf_famili     0.114730
22               isolation_total     0.110676
52                  liwc_friends     0.110039
84              liwc_swear_words  

In [8]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Read the CSV file
combined_df = pd.read_csv('./combined_df.csv')

In [10]:
combined_df.drop(columns=['post'], inplace=True)

In [12]:


import pandas as pd
from scipy.stats import f_oneway

# Calculate the Anova F-statistic for each feature
f_statistics = {}
for column in combined_df.columns:
    if column != 'subreddit':  # Skip the target variable
        groups = [combined_df[column][combined_df['subreddit'] == label] for label in combined_df['subreddit'].unique()]
        f_statistic, _ = f_oneway(*groups)
        f_statistics[column] = f_statistic

# Sort the F-statistics in descending order
sorted_f_statistics = dict(sorted(f_statistics.items(), key=lambda item: item[1], reverse=True))

# Print the sorted F-statistics
for feature, f_statistic in sorted_f_statistics.items():
    print(f'F-statistic between "subreddit" and "{feature}": {f_statistic:.2f}')



F-statistic between "subreddit" and "tfidf_adhd": 20799.19
F-statistic between "subreddit" and "tfidf_anxieti": 19598.15
F-statistic between "subreddit" and "tfidf_addict": 18865.69
F-statistic between "subreddit" and "tfidf_ptsd": 18011.20
F-statistic between "subreddit" and "tfidf_drink": 16796.50
F-statistic between "subreddit" and "tfidf_bpd": 16726.12
F-statistic between "subreddit" and "tfidf_alcohol": 12247.04
F-statistic between "subreddit" and "substance_use_total": 7558.22
F-statistic between "subreddit" and "sent_neu": 6330.74
F-statistic between "subreddit" and "liwc_ingestion": 6225.05
F-statistic between "subreddit" and "tfidf_depress": 5834.15
F-statistic between "subreddit" and "sent_neg": 5587.82
F-statistic between "subreddit" and "isolation_total": 4970.82
F-statistic between "subreddit" and "tfidf_attack": 4173.07
F-statistic between "subreddit" and "tfidf_suicid": 3982.79
F-statistic between "subreddit" and "tfidf_anxious": 3931.92
F-statistic between "subreddit" a

In [13]:
# Filter features with F-statistic > 500
selected_features = [feature for feature, f_statistic in f_statistics.items() if f_statistic > 500]

# Create a new DataFrame with selected features
combined_reduced_df = combined_df[selected_features]

# Save the new DataFrame to a CSV file
combined_reduced_df.to_csv('combined_reduced.csv', index=False)

In [14]:
# remove all columns which have their name starting with 'tfidf_'
combined_reduced_df = combined_reduced_df.loc[:, ~combined_reduced_df.columns.str.startswith('tfidf_')]
# Save the new DataFrame to a CSV file
combined_reduced_df.to_csv('combined_reduced.csv', index=False)

In [17]:
combined_reduced_df.shape

(374499, 30)

In [None]:
# print all unique values in the 'subreddit' column
print(combined_df['subreddit'].unique())

['addiction' 'adhd' 'alcoholism' 'anxiety' 'autism' 'bpd' 'depression'
 'lonely' 'ptsd' 'schizophrenia' 'suicidewatch']


In [None]:
# drop subreddit_encoded column from combined_df
combined_df.drop(columns=['subreddit_encoded'], inplace=True)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import optuna
import numpy as np

# drop the 'subreddit' column from combined_df and assign the result to X
X = combined_df.drop(columns=['subreddit'])

# assign the 'subreddit' column from combined_df to y
y = combined_df['subreddit']

# Split the data into training and testing sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Further split the training set into training and validation sets (50% training, 50% validation)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.5, random_state=42)

def objective(trial):
    # Define the hyperparameters to optimize
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    max_depth = trial.suggest_int("max_depth", 5, 80)
    # min_samples_split = trial.suggest_float("min_samples_split", 0.1, 1.0)
    # min_samples_leaf = trial.suggest_float("min_samples_leaf", 0.1, 0.5)

    # Initialize the Random Forest Classifier with suggested hyperparameters
    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        criterion='entropy',
        # min_samples_split=min_samples_split,
        # min_samples_leaf=min_samples_leaf,
        class_weight='balanced_subsample',
        random_state=42
    )

    # Fit the classifier to the training data
    clf.fit(X_train, y_train)

    # Make predictions on the validation data
    y_val_pred = clf.predict(X_val)

    # Calculate accuracy
    accuracy = accuracy_score(y_val, y_val_pred)

    return accuracy

# Create an Optuna study for optimization
study = optuna.create_study(direction="maximize")

# Optimize the objective function
study.optimize(objective, n_trials=50)  # You can adjust the number of trials

# Get the best hyperparameters from the study
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Initialize the Random Forest Classifier with the best hyperparameters
best_clf = RandomForestClassifier(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    # min_samples_split=best_params["min_samples_split"],
    # min_samples_leaf=best_params["min_samples_leaf"],
    class_weight='balanced_subsample',
    random_state=42
)


[32m[I 2023-10-24 19:48:50,781][0m A new study created in memory with name: no-name-1f871dc4-553b-46ba-aa20-c14486bfacf1[0m
[32m[I 2023-10-24 19:54:06,144][0m Trial 0 finished with value: 0.620316612626359 and parameters: {'n_estimators': 149, 'max_depth': 52}. Best is trial 0 with value: 0.620316612626359.[0m
[32m[I 2023-10-24 20:07:44,402][0m Trial 1 finished with value: 0.6231089071142476 and parameters: {'n_estimators': 390, 'max_depth': 80}. Best is trial 1 with value: 0.6231089071142476.[0m
[32m[I 2023-10-24 20:10:22,142][0m Trial 2 finished with value: 0.6154491703223346 and parameters: {'n_estimators': 76, 'max_depth': 68}. Best is trial 1 with value: 0.6231089071142476.[0m
[32m[I 2023-10-24 20:24:53,523][0m Trial 3 finished with value: 0.622513827961091 and parameters: {'n_estimators': 418, 'max_depth': 49}. Best is trial 1 with value: 0.6231089071142476.[0m
[32m[I 2023-10-24 20:32:19,695][0m Trial 4 finished with value: 0.6147930574098799 and parameters: {'n_

In [None]:
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced_subsample', random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.6301290609701825


In [None]:
max_depth = clf.estimators_[0].tree_.max_depth
print(max_depth)

98


In [None]:

# Fit the best classifier to the entire training dataset
best_clf.fit(X_train, y_train)

# Make predictions on the test data
y_test_pred = best_clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {accuracy:.2f}')

# Generate a classification report on the test data
class_report = classification_report(y_test, y_test_pred, zero_division=0)
print("Test Classification Report:\n", class_report)


Test Accuracy: 0.64
Test Classification Report:
                precision    recall  f1-score   support

    addiction       0.75      0.56      0.64      2270
         adhd       0.69      0.78      0.74     13753
   alcoholism       0.68      0.73      0.70      1737
      anxiety       0.80      0.72      0.76     17178
       autism       0.61      0.29      0.39      2634
          bpd       0.91      0.49      0.64      7187
   depression       0.55      0.70      0.62     35446
       lonely       0.58      0.47      0.52      6961
         ptsd       0.81      0.55      0.65      2631
schizophrenia       0.66      0.14      0.23      2673
 suicidewatch       0.59      0.58      0.59     19880

     accuracy                           0.64    112350
    macro avg       0.69      0.55      0.59    112350
 weighted avg       0.65      0.64      0.63    112350



In [None]:
# print clf.feature_importances_ in descending order of importance 
feature_importances = pd.DataFrame(best_clf.feature_importances_, index = X_train.columns, columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances)

                             importance
tfidf_ptsd                     0.033360
tfidf_addict                   0.030753
tfidf_adhd                     0.030692
tfidf_bpd                      0.029676
tfidf_anxieti                  0.025313
tfidf_drink                    0.018946
tfidf_alcohol                  0.014629
liwc_ingestion                 0.013829
isolation_total                0.012952
substance_use_total            0.012034
sent_neu                       0.011648
sent_neg                       0.011510
coleman_liau_index             0.009752
tfidf_depress                  0.009702
suicidality_total              0.009169
sent_compound                  0.008465
wiener_sachtextformel          0.008017
flesch_reading_ease            0.007915
liwc_death                     0.007733
lix                            0.007461
automated_readability_index    0.007293
sent_pos                       0.007254
liwc_social_processes          0.006921
flesch_kincaid_grade_level     0.006882


In [None]:
# print number of entries for each type of value in the 'subreddit' column
print(combined_df['subreddit'].value_counts())

depression       117331
suicidewatch      66161
anxiety           57671
adhd              45631
bpd               24294
lonely            23635
autism             8869
schizophrenia      8712
ptsd               8643
addiction          7641
alcoholism         5911
Name: subreddit, dtype: int64
