In [None]:
import pandas as pd
import numpy as np
import string
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.pipeline import Pipeline


In [None]:
from wordcloud import WordCloud
from nltk.corpus import stopwords
from textblob import Word, TextBlob
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from nltk.stem import WordNetLemmatizer
import spacy

import matplotlib.pyplot as plt
import seaborn as sns




In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_rows', 55)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [None]:
import pandas as pd
from datasets import load_dataset

# Завантаження датасету
dataset = load_dataset('mavinsao/reddit-mental-illness-82')
df = dataset['train'].to_pandas()

# Перетворення тексту на нижній регістр
df['text'] = df['text'].str.lower()

# Фільтрація рядків за наявністю ключових слів
keywords = [
    "think i have", "i think i have", "i think it might be", "i think i could have",
    "might have", "i might have", "might be", "feel like i have", "i feel like i have",
    "feels like i have", "self-diagnose", "self-diagnosed", "i've self-diagnosed",
    "unsure if i have", "i'm unsure if", "unsure if this is", "wonder if i have",
    "i wonder if i have", "wonder if it's", "symptoms of", "i have symptoms of",
    "experiencing symptoms of", "suspect i have", "i suspect i have", "i suspect it's",
    "probably have", "i probably have", "i think i probably have", "could be", "it could be",
    "seems like i have", "it seems like i have", "it seems like", "not diagnosed but",
    "i am not diagnosed but", "i haven't been diagnosed but"
]

df = df[~df['text'].str.contains('|'.join(keywords), case=False, na=False)]

# Розбиття тексту на заголовок і основний текст
def split_text(row):
    if ':' in row:
        parts = row.split(':', 1)
        return parts[0].strip(), parts[1].strip()
    return None, row.strip()

df[['title', 'main_text']] = df['text'].apply(split_text).apply(pd.Series)

# Перевірка результату
print(df[['title', 'main_text', 'label']])


Downloading readme:   0%|          | 0.00/755 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.03M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.04M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/42113 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5264 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5265 [00:00<?, ? examples/s]

                                                   title  \
0                              fantasizing about your fp   
1      this is a support subreddit for people with a ...   
2                        scared of my psychotic symptoms   
4      i feel like a sick animal that needs to be put...   
5      is constantly checking ocd subreddits a compul...   
...                                                  ...   
42108  16m suicidal, lonely, need to feel loved befor...   
42109                               got asked on a date!   
42110         i just found out my friend killed herself.   
42111  family letter detailing the conditions and lib...   
42112                     i'm very proud of myself today   

                                               main_text  label  
0      do you do it? what do you fantasize about? : w...      3  
1      it's not for posting how infuriating the uneve...      6  
2      i'm trying to keep reminding myself that what ...      2  
4      i had a 

In [None]:
import pandas as pd
from nltk.corpus import stopwords
import spacy

# Функція очищення тексту
def clean_text(text):
    text = text.str.lower()
    text = text.str.replace(r'[^\w\s]', '', regex=True)
    text = text.str.replace("\n", '', regex=True)
    text = text.str.replace('\d', '', regex=True)
    text = text.str.replace(r'\[.*?\]', '', regex=True)
    text = text.str.replace(r'https?://\S+|www\.\S+', '', regex=True)
    text = text.str.replace(r'<.*?>+', '', regex=True)
    text = text.str.replace(r'\w*\d\w*', '', regex=True)
    return text

# Функція видалення стоп-слів
# custom_stopwords = {'ocd', 'anxiety', 'adhd', 'ptsd', 'bpd', 'depression', 'bipolar'}
custom_stopwords = {}
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    stop_words.update(custom_stopwords)
    text = text.apply(lambda x: " ".join(word for word in str(x).split() if word.lower() not in stop_words))
    return text

# Функція лематизації
nlp = spacy.load('en_core_web_sm')

def lemmatize_sentence(sentence):
    doc = nlp(sentence)
    return " ".join([token.lemma_ for token in doc])

# Застосування функцій до обох стовпців
def process_text_columns(df, columns):
    for col in columns:
        df[col] = clean_text(df[col])
        df[col] = remove_stopwords(df[col])
        delete = pd.Series(' '.join(df[col]).split()).value_counts()[-1000:]
        df[col] = df[col].apply(lambda x: " ".join(word for word in x.split() if word.lower() not in delete))
        df[col] = df[col].apply(lemmatize_sentence)
    return df

df = pd.DataFrame(df)
df = process_text_columns(df, ['title', 'main_text'])
print(df)


                                                    text  label  \
0      fantasizing about your fp: do you do it? what ...      3   
1      this is a support subreddit for people with a ...      6   
2      scared of my psychotic symptoms : i'm trying t...      2   
4      i feel like a sick animal that needs to be put...      4   
5      is constantly checking ocd subreddits a compul...      6   
...                                                  ...    ...   
42108  16m suicidal, lonely, need to feel loved befor...      4   
42109  got asked on a date! : and by a girl i already...      1   
42110  i just found out my friend killed herself. : i...      3   
42111  family letter detailing the conditions and lib...      5   
42112  i'm very proud of myself today : i had an exam...      3   

                                                   title  \
0                                           fantasize fp   
1                support subreddit people mental illness   
2              

In [None]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split




# Об'єднання заголовку і основного тексту в один стовпець
df['text'] = df['title'].str.strip(':') + ' ' + df['main_text']
# df['text'] = df['main_text']
X = df['text']
y = df['label']

undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_res, y_res = undersampler.fit_resample(X.values.reshape(-1, 1), y)

# Перевірка, чи вдалося збалансувати класи
print("Розподіл класів після балансування:")
print(pd.Series(y_res).value_counts())

balanced_df = pd.DataFrame({
    'text': X_res.flatten(),
    'label': y_res
})

# тренувальний та тестовий набори
train_df, test_df = train_test_split(balanced_df, test_size=0.2, random_state=42)

print("\nРозподіл класів у тренувальному наборі:")
print(train_df['label'].value_counts())

print("\nРозподіл класів у тестовому наборі:")
print(test_df['label'].value_counts())


Розподіл класів після балансування:
label
0    3257
1    3257
2    3257
3    3257
4    3257
5    3257
6    3257
7    3257
Name: count, dtype: int64

Розподіл класів у тренувальному наборі:
label
5    2624
3    2622
7    2616
6    2615
0    2615
4    2604
1    2586
2    2562
Name: count, dtype: int64

Розподіл класів у тестовому наборі:
label
2    695
1    671
4    653
0    642
6    642
7    641
3    635
5    633
Name: count, dtype: int64


In [None]:
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Трансформація тексту в TF-IDF вектори
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(train_df['text'])
X_test = tfidf_vectorizer.transform(test_df['text'])

# Перетворення текстових міток у числові значення
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['label'])
y_test = label_encoder.transform(test_df['label'])

# гіперпараметри для перебору
grid_params = {
    'max_depth': [3, 6, 9],
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1]
}


param_grid = list(ParameterGrid(grid_params))
best_accuracy = 0
best_params = None

for params in param_grid:
    print(f"Training with parameters: {params}")
    xgb_classifier = XGBClassifier(
        random_state=42,
        use_label_encoder=False,
        **params
    )
    xgb_classifier.fit(X_train, y_train)

    y_test_pred = xgb_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_test_pred)
    print(f"Accuracy: {accuracy}")

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = params

print("\nBest Parameters:", best_params)
print("Best Test Set Accuracy:", best_accuracy)


Training with parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Accuracy: 0.6841903300076746
Training with parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}
Accuracy: 0.695510360706063
Training with parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}
Accuracy: 0.7052954719877207
Training with parameters: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 100}
Accuracy: 0.7102839600920952
Training with parameters: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 200}
Accuracy: 0.7229470452801228
Training with parameters: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 300}
Accuracy: 0.7338833461243285
Training with parameters: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 100}
Accuracy: 0.7262087490406753
Training with parameters: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 200}
Accuracy: 0.7352264006139677
Training with parameters: {'learning_rate': 0.01, 'max_depth': 9, 'n_esti

In [None]:
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(train_df['text'])
X_test = tfidf_vectorizer.transform(test_df['text'])

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['label'])
y_test = label_encoder.transform(test_df['label'])

# XGBoost
xgb_classifier = XGBClassifier(
    max_depth=6,
    n_estimators=200,
    learning_rate=0.05,
    random_state=42,
    use_label_encoder=False
)
xgb_classifier.fit(X_train, y_train)

y_test_pred = xgb_classifier.predict(X_test)

print("Test Set Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, y_test_pred)}")
print(classification_report(y_test, y_test_pred))


Test Set Evaluation:
Accuracy: 0.7674597083653109
              precision    recall  f1-score   support

           0       0.83      0.78      0.80       642
           1       0.77      0.76      0.76       671
           2       0.82      0.65      0.73       695
           3       0.70      0.70      0.70       635
           4       0.58      0.75      0.65       653
           5       0.78      0.96      0.86       633
           6       0.91      0.79      0.85       642
           7       0.83      0.76      0.80       641

    accuracy                           0.77      5212
   macro avg       0.78      0.77      0.77      5212
weighted avg       0.78      0.77      0.77      5212



In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from itertools import product

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['text'])

X_train = tokenizer.texts_to_sequences(train_df['text'])
X_test = tokenizer.texts_to_sequences(test_df['text'])
max_len = max(len(x) for x in X_train + X_test)

X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['label'])
y_test = label_encoder.transform(test_df['label'])

class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

class ThreeLayerLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, lstm_units, output_dim, dropout_rate):
        super(ThreeLayerLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm1 = nn.LSTM(embed_dim, lstm_units, batch_first=True, dropout=dropout_rate, bidirectional=True)
        self.lstm2 = nn.LSTM(lstm_units * 2, lstm_units, batch_first=True, dropout=dropout_rate, bidirectional=True)
        self.lstm3 = nn.LSTM(lstm_units * 2, lstm_units, batch_first=True, dropout=dropout_rate, bidirectional=True)
        self.fc = nn.Linear(lstm_units * 2, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        x, _ = self.lstm3(x)
        x = self.dropout(x[:, -1, :])
        x = self.fc(x)
        return x

output_dim = len(np.unique(y_train))
param_grid = {
    'embed_dim': [50, 100],
    'lstm_units': [64, 100],
    'dropout_rate': [0.2, 0.3],
    'learning_rate': [0.001, 0.0005]
}

param_combinations = list(product(
    param_grid['embed_dim'],
    param_grid['lstm_units'],
    param_grid['dropout_rate'],
    param_grid['learning_rate']
))

results = []

# Грід серч
for embed_dim, lstm_units, dropout_rate, learning_rate in param_combinations:
    print(f"Training with parameters: embed_dim={embed_dim}, lstm_units={lstm_units}, dropout_rate={dropout_rate}, learning_rate={learning_rate}")

    model = ThreeLayerLSTM(len(tokenizer.word_index) + 1, embed_dim, lstm_units, output_dim, dropout_rate).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    def train_model(model, train_loader, optimizer, criterion, device, epochs=50):
        model.train()
        history = {'loss': [], 'accuracy': []}
        for epoch in range(epochs):
            total_loss, correct = 0, 0
            for texts, labels in train_loader:
                texts, labels = texts.to(device), labels.to(device)

                optimizer.zero_grad()
                outputs = model(texts)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                correct += (outputs.argmax(1) == labels).sum().item()

            accuracy = correct / len(train_loader.dataset)
            history['loss'].append(total_loss / len(train_loader))
            history['accuracy'].append(accuracy)
            print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}, Accuracy: {accuracy:.4f}")
        return history

    history = train_model(model, train_loader, optimizer, criterion, device)

    def evaluate_model(model, test_loader, device):
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for texts, labels in test_loader:
                texts, labels = texts.to(device), labels.to(device)
                outputs = model(texts)
                preds = outputs.argmax(1).cpu().numpy()
                all_preds.extend(preds)
                all_labels.extend(labels.cpu().numpy())
        return all_preds, all_labels

    y_test_pred, y_test_labels = evaluate_model(model, test_loader, device)
    test_acc = accuracy_score(y_test_labels, y_test_pred)
    results.append({
        'embed_dim': embed_dim,
        'lstm_units': lstm_units,
        'dropout_rate': dropout_rate,
        'learning_rate': learning_rate,
        'train_loss': history['loss'][-1],
        'train_acc': history['accuracy'][-1],
        'test_acc': test_acc
    })

    print(f"Results: Train Loss: {history['loss'][-1]:.4f}, Train Accuracy: {history['accuracy'][-1]:.4f}, Test Accuracy: {test_acc:.4f}\n")

best_result = max(results, key=lambda x: x['test_acc'])
print("Best Parameters:", best_result)


Using device: cuda
Training with parameters: embed_dim=50, lstm_units=64, dropout_rate=0.2, learning_rate=0.001
Epoch 1/50, Loss: 1.9084, Accuracy: 0.2331
Epoch 2/50, Loss: 1.6566, Accuracy: 0.3553
Epoch 3/50, Loss: 1.5296, Accuracy: 0.4098
Epoch 4/50, Loss: 1.3733, Accuracy: 0.4749
Epoch 5/50, Loss: 1.2530, Accuracy: 0.5256
Epoch 6/50, Loss: 1.1864, Accuracy: 0.5632
Epoch 7/50, Loss: 1.0246, Accuracy: 0.6381
Epoch 8/50, Loss: 0.8471, Accuracy: 0.7129
Epoch 9/50, Loss: 0.7062, Accuracy: 0.7659
Epoch 10/50, Loss: 0.6210, Accuracy: 0.7960
Epoch 11/50, Loss: 0.5571, Accuracy: 0.8191
Epoch 12/50, Loss: 0.4951, Accuracy: 0.8405
Epoch 13/50, Loss: 0.4318, Accuracy: 0.8642
Epoch 14/50, Loss: 0.3667, Accuracy: 0.8867
Epoch 15/50, Loss: 0.3142, Accuracy: 0.9064
Epoch 16/50, Loss: 0.2648, Accuracy: 0.9217
Epoch 17/50, Loss: 0.2210, Accuracy: 0.9348
Epoch 18/50, Loss: 0.1825, Accuracy: 0.9490
Epoch 19/50, Loss: 0.1527, Accuracy: 0.9576
Epoch 20/50, Loss: 0.1299, Accuracy: 0.9646
Epoch 21/50, Loss