In [1]:
import pandas as pd
import numpy as np

In [2]:
df_train = pd.read_csv(r"D:\Soteria_NLP\Raw_files\train.tsv", sep ='\t' )
df_test = pd.read_csv(r"D:\Soteria_NLP\Raw_files\test.tsv", sep= '\t')
df_dev = pd.read_csv(r"D:\Soteria_NLP\Raw_files\dev.tsv", sep = '\t')

print(df_train.head())
print(df_test.head())
print(df_dev.head())

           PID                                          Text_data     Label
0  train_pid_1  Waiting for my mind to have a breakdown once t...  moderate
1  train_pid_2  My new years resolution : I'm gonna get my ass...  moderate
2  train_pid_3  New year : Somone else Feeling like 2020 will ...  moderate
3  train_pid_4  My story I guess : Hi, Im from Germany and my ...  moderate
4  train_pid_5  Sat in the dark and cried myself going into th...  moderate
          Pid                                          text data Class labels
0  test_pid_1  Im scared : This is it. I lie to myself every ...     moderate
1  test_pid_2  New to this but just wanted to vent : I just f...     moderate
2  test_pid_3  I’m sad : It’s kinda always been an issue. I w...     moderate
3  test_pid_4  Lonely but not alone. : All of my immediately ...     moderate
4  test_pid_5  This year has been trash. : I dont know why I’...     moderate
         PID                                          Text data     Label
0 

In [3]:
print(df_train.info())
print(df_test.info())
print(df_dev.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8891 entries, 0 to 8890
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   PID        8891 non-null   object
 1   Text_data  8891 non-null   object
 2   Label      8891 non-null   object
dtypes: object(3)
memory usage: 208.5+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3245 entries, 0 to 3244
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Pid           3245 non-null   object
 1   text data     3245 non-null   object
 2   Class labels  3245 non-null   object
dtypes: object(3)
memory usage: 76.2+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4496 entries, 0 to 4495
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   PID        4496 non-null   object
 1   Text data  4496 non-null   object
 2   Label      4496 non-nul

In [4]:
print(df_train['Label'].value_counts())
print(df_test['Class labels'].value_counts())
print(df_dev['Label'].value_counts())

moderate          6004
not depression    1985
severe             902
Name: Label, dtype: int64
moderate          2169
not depression     848
severe             228
Name: Class labels, dtype: int64
moderate          2306
not depression    1830
severe             360
Name: Label, dtype: int64


In [5]:
df2_train = df_train.copy()

# EXP-1

In [7]:
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer


stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Remove all the special characters
    text = re.sub(r'\W', ' ', str(text))

    # Remove all single characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    

    # Substitute multiple spaces with single space
    text = re.sub(r'\s+|\d+', ' ', text, flags=re.I)

    # Convert to lowercase
    text = text.lower()

    # Lemmatization
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text if not word in stop_words]
    text = ' '.join(text)
    
    return text


df_train['Text_data'] = df_train['Text_data'].apply(preprocess_text)

In [8]:
df_clean_train = df_train.copy()

In [9]:
X = df_train['Text_data']
y= df_train['Label']

In [11]:
from sklearn.model_selection import train_test_split

# Splitting the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,stratify=y, random_state=42)

In [12]:
y_train.value_counts()

moderate          4803
not depression    1588
severe             721
Name: Label, dtype: int64

### Data Augmentation

#### Synonym Replacement

In [16]:
import nltk
from nltk.corpus import wordnet
import random

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word): 
        for lemma in syn.lemmas(): 
            synonyms.add(lemma.name())
    if word in synonyms:
        synonyms.remove(word)
    return list(synonyms)

def synonym_replacement(words, n=5):
    words = words.split()
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word not in stop_words]))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(list(synonyms))
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break
    sentence = ' '.join(new_words)
    return sentence

In [18]:
df_train = pd.concat([X_train, y_train], axis=1)

df_moderate = df_train[df_train['Label'] == 'moderate']
df_not_depression = df_train[df_train['Label'] == 'not depression']
df_severe = df_train[df_train['Label'] == 'severe']

df_not_depression_over = df_not_depression.sample(int(len(df_not_depression)*0.5), replace=True, random_state=42)
df_not_depression_over['Text_data'] = df_not_depression_over['Text_data'].apply(synonym_replacement)

df_severe_over = df_severe.sample(len(df_severe)*2, replace=True, random_state=42)
df_severe_over['Text_data'] = df_severe_over['Text_data'].apply(synonym_replacement)

df_moderate_under = df_moderate.sample(len(df_not_depression) + int(len(df_not_depression)*0.5), random_state=42)

df_train_aug = pd.concat([df_moderate_under, df_not_depression, df_not_depression_over, df_severe, df_severe_over])

X_train_aug = df_train_aug['Text_data']
y_train_aug = df_train_aug['Label']


In [20]:
y_train_aug.value_counts()

moderate          2382
not depression    2382
severe            2163
Name: Label, dtype: int64

In [21]:
duplicates = X_train_aug.duplicated()

# Count the number of duplicate rows
num_duplicates = duplicates.sum()

# Print the number of duplicate rows
print("Number of duplicate rows:", num_duplicates)

Number of duplicate rows: 2707


In [23]:
 #Assuming 'X_train_aug' and 'y_train_aug' are your DataFrames
duplicates = X_train_aug.duplicated(keep=False)  # Find all duplicate rows
duplicate_indices = duplicates[duplicates].index  # Get the indices of duplicate rows

# Filter the duplicate rows in 'X_train_aug' and their corresponding rows in 'y_train_aug'
duplicate_data = X_train_aug.loc[duplicate_indices]
duplicate_labels = y_train_aug.loc[duplicate_indices]

# Combine 'X_train_aug' and 'y_train_aug' for duplicate rows into a single DataFrame
duplicate_combined = pd.concat([duplicate_data, duplicate_labels], axis=1)

# Download the combined DataFrame as a CSV file
duplicate_combined.to_csv('duplicate_data.csv', index=False)

In [28]:
y_train.value_counts()

moderate          4803
not depression    1588
severe             721
Name: Label, dtype: int64

In [31]:
def convert_labels_to_scores(label):
    if label == 'not depression':
        return 0.25
    elif label == 'moderate':
        return 0.5
    elif label == 'severe':
        return 1.0

y_train = y_train.apply(convert_labels_to_scores)
y_train_aug = y_train_aug.apply(convert_labels_to_scores)
y_val = y_val.apply(convert_labels_to_scores)


## RNN 

In [35]:
lengths = [len(x.split()) for x in X_train]
avg_length = np.mean(lengths)
percentile_95_length = np.percentile(lengths, 95)
print(avg_length, percentile_95_length)

63.82156917885264 201.0


In [36]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define the maximum number of words in the vocabulary and the maximum length for sequences
max_words = 10000
max_length = 200

# Instantiate the Tokenizer and fit it to the training data
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convert the texts to sequences
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_aug_sequences = tokenizer.texts_to_sequences(X_train_aug)
X_val_sequences = tokenizer.texts_to_sequences(X_val)

# Pad the sequences so they're all the same length
X_train_pad = pad_sequences(X_train_sequences, maxlen=max_length, padding='post', truncating='post')
X_train_aug_pad = pad_sequences(X_train_aug_sequences, maxlen=max_length, padding='post', truncating='post')
X_val_pad = pad_sequences(X_val_sequences, maxlen=max_length, padding='post', truncating='post')


#### Training with X_train_pad

In [40]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.callbacks import EarlyStopping

model_rnn_X_train = Sequential()
model_rnn_X_train.add(Embedding(max_words, 32, input_length=max_length))
model_rnn_X_train.add(SimpleRNN(32, return_sequences=False))
model_rnn_X_train.add(Dense(1))

model_rnn_X_train.compile(optimizer='adam', loss='mean_squared_error')

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model_rnn_X_train.fit(X_train_pad, y_train, epochs=30, validation_data=(X_val_pad, y_val),
                                callbacks=[early_stopping])


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30


In [48]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Get predicted values
y_val_predicted = model_rnn_X_train.predict(X_val_pad)

# Calculate Mean Absolute Error
mae = mean_absolute_error(y_val, y_val_predicted)
print('Mean Absolute Error:', mae)

# Calculate Mean Squared Error
mse = mean_squared_error(y_val, y_val_predicted)
print('Mean Squared Error:', mse)

# Calculate Root Mean Squared Error
rmse = np.sqrt(mse)
print('Root Mean Squared Error:', rmse)

Mean Absolute Error: 0.11742201002713601
Mean Squared Error: 0.039425625282370834
Root Mean Squared Error: 0.19855887107447714


#### Training with X_train_aug_pad

In [42]:
model_rnn_X_train_aug = Sequential()
model_rnn_X_train_aug.add(Embedding(max_words, 32, input_length=max_length))
model_rnn_X_train_aug.add(SimpleRNN(32, return_sequences=False))
model_rnn_X_train_aug.add(Dense(1))

model_rnn_X_train_aug.compile(optimizer='adam', loss='mean_squared_error')

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model_rnn_X_train_aug.fit(X_train_aug_pad, y_train_aug, epochs=30, validation_data=(X_val_pad, y_val),
                                    callbacks=[early_stopping])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30


In [49]:
# Get predicted values
y_val_predicted = model_rnn_X_train_aug.predict(X_val_pad)

# Calculate Mean Absolute Error
mae = mean_absolute_error(y_val, y_val_predicted)
print('Mean Absolute Error:', mae)

# Calculate Mean Squared Error
mse = mean_squared_error(y_val, y_val_predicted)
print('Mean Squared Error:', mse)

# Calculate Root Mean Squared Error
rmse = np.sqrt(mse)
print('Root Mean Squared Error:', rmse)

Mean Absolute Error: 0.16204104187650717
Mean Squared Error: 0.044674313636210515
Root Mean Squared Error: 0.2113629902234791


## LSTM

In [50]:
from tensorflow.keras.layers import Embedding, LSTM, Dense

model_lstm_X_train = Sequential()
model_lstm_X_train.add(Embedding(max_words, 32, input_length=max_length))
model_lstm_X_train.add(LSTM(32, return_sequences=False))
model_lstm_X_train.add(Dense(1))

model_lstm_X_train.compile(optimizer='adam', loss='mean_squared_error')

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model_lstm_X_train.fit(X_train_pad, y_train_scores, epochs=30, validation_data=(X_val_pad, y_val_scores), callbacks=[early_stopping])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30


In [51]:
# Get predicted values
y_val_predicted = model_lstm_X_train.predict(X_val_pad)

# Calculate Mean Absolute Error
mae = mean_absolute_error(y_val, y_val_predicted)
print('Mean Absolute Error:', mae)

# Calculate Mean Squared Error
mse = mean_squared_error(y_val, y_val_predicted)
print('Mean Squared Error:', mse)

# Calculate Root Mean Squared Error
rmse = np.sqrt(mse)
print('Root Mean Squared Error:', rmse)

Mean Absolute Error: 0.10778830218442442
Mean Squared Error: 0.039004328921241116
Root Mean Squared Error: 0.19749513644958733


In [54]:
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import Adam

model_lstm_X_train = Sequential()
model_lstm_X_train.add(Embedding(max_words, 64, input_length=max_length))  # Increased embedding dimension
model_lstm_X_train.add(LSTM(64, return_sequences=True))  # Added more units and returning sequences
model_lstm_X_train.add(Dropout(0.2))  # Added dropout
model_lstm_X_train.add(LSTM(32, return_sequences=False))  # Added another LSTM layer
model_lstm_X_train.add(Dense(1))

model_lstm_X_train.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')  # Tuned learning rate

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model_lstm_X_train.fit(X_train_pad, y_train_scores, epochs=30, validation_data=(X_val_pad, y_val_scores), batch_size=64, callbacks=[early_stopping])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
