In [17]:
import pandas as pd
import re

In [18]:
# Read the raw dataset from disk. The file is decoded as ISO-8859-1 (Latin-1)
# rather than pandas' default UTF-8.
data = pd.read_csv(
    filepath_or_buffer='./dataset.csv',
    encoding='ISO-8859-1',
)

# Clean the text data
def clean_text(text):
    """Truncate ``text`` at the first "@" and strip disallowed characters.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example the float NaN that
        pandas produces for an empty CSV cell, or ``None``) is returned
        unchanged so that missing values stay detectable afterwards.

    Returns
    -------
    str
        The part of ``text`` before the first "@", containing only ASCII
        letters, digits, whitespace and the marks ``. ? ! ,``.
    """
    # re.sub raises TypeError on non-strings; pass missing values through.
    if not isinstance(text, str):
        return text
    # Remove anything that comes after "@" (the "@" itself included).
    # partition() also works when the "@" is followed by further lines,
    # which the pattern r'@.*$' silently failed to match.
    text = text.partition('@')[0]
    # Keep only letters, numbers, whitespace, and grammatical marks
    text = re.sub(r'[^a-zA-Z0-9\s\.\?\!,]', '', text)
    return text

# Apply the clean_text function to the "Content" column of the dataset
data['Content'] = data['Content'].apply(clean_text)

# Delete certain columns and the first row of data
columns_to_delete = ['ID', 'AuthorID', 'Author', 'Date', 'Words']
data = data.drop(columns=columns_to_delete)
# .copy() turns the row slice into an independent frame, so the column
# assignments below never write into a slice of another DataFrame
# (which is what triggers pandas' SettingWithCopyWarning).
data = data.iloc[1:].copy()

# For every column except "Content": map the 'N'/'Y' flags to 0/1, then
# convert to nullable integers. Values that are still non-numeric after the
# mapping are coerced to <NA> rather than raising.
for column in data.columns:
    if column != 'Content':
        data[column] = data[column].replace({'N': 0, 'Y': 1})
        data[column] = pd.to_numeric(data[column], errors='coerce').astype('Int64')

# Save the cleaned dataset to a new CSV file (without the index column)
data.to_csv('./cleaned_dataset.csv', index=False)

In [24]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import TFBertForSequenceClassification, BertTokenizer

In [28]:


# Configuration: values a reader may want to tune, gathered in one place
SEED = 42              # used for the train/test split and TensorFlow's global RNG
MAX_SEQ_LENGTH = 387   # every sentence is padded/truncated to this many tokens
TEST_SIZE = 0.2
EPOCHS = 5
BATCH_SIZE = 16
LABEL_COLUMNS = [
    'Change in appetite, losing or gaining weight',
    'Sleeping too much or not sleeping well (insomnia)',
    'Fatigue and low energy most days',
    'Feeling worthless, guilty, and hopeless',
    'An inability to focus and concentrate that may interfere with daily tasks at home, work, or school',
    'Thinking about death and dying; suicidal ideation or suicide attempts',
    'None',
    'Movements that are unusually slow or agitated (a change which is often noticeable to others)',
]

# Seed TensorFlow before the model is created so that the newly initialised
# classifier layer and the shuffling done by fit() are more repeatable
# between runs (the split below was already seeded).
tf.random.set_seed(SEED)

# Load the dataset
df = pd.read_csv("./cleaned_dataset.csv")

# Drop rows with missing values. Rebinding (instead of inplace=True) keeps
# the step a plain expression with no hidden in-place mutation.
df = df.dropna()

# Convert labels to tensor: one row per sample, one column per label
labels = tf.convert_to_tensor(df[LABEL_COLUMNS].values)

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=labels.shape[1])

# Compile the model for training. The loss is binary cross-entropy applied
# to the raw logits, i.e. each label column is scored independently.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metric = tf.keras.metrics.BinaryAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Tokenize input sentences and create attention masks in one batched call.
# This replaces a per-sentence encode_plus loop followed by tf.concat and
# produces the same (num_samples, MAX_SEQ_LENGTH) tensors.
# astype(str) guards against non-string cells, which the tokenizer rejects.
sentences = df['Content'].astype(str).tolist()
encoded = tokenizer(
    sentences,
    add_special_tokens=True,
    max_length=MAX_SEQ_LENGTH,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='tf',
)
input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']

# Split the data into training and testing sets. train_test_split works on
# array-likes, so the tensors are converted to NumPy arrays first.
input_ids_arr = np.asarray(input_ids)
attention_masks_arr = np.asarray(attention_masks)
labels_arr = np.asarray(labels)
(
    train_input_ids, test_input_ids,
    train_attention_masks, test_attention_masks,
    train_labels, test_labels,
) = train_test_split(
    input_ids_arr, attention_masks_arr, labels_arr,
    test_size=TEST_SIZE, random_state=SEED,
)

# Train the model or load saved weights
checkpoint_file = 'model_weights.h5'
# `history` stays None when weights are restored from disk, so later cells
# can test for it instead of hitting a NameError.
history = None
if tf.io.gfile.exists(checkpoint_file):
    # Load saved weights if the checkpoint file exists
    model.load_weights(checkpoint_file)
else:
    # Train the model and save the weights.
    # NOTE(review): the test split doubles as validation data here, so the
    # validation metrics are not computed on a separate hold-out set.
    history = model.fit(
        [train_input_ids, train_attention_masks],
        train_labels,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_data=([test_input_ids, test_attention_masks], test_labels),
    )
    model.save_weights(checkpoint_file)



All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
 # Evaluate the model
# Score the model on the test split. evaluate() returns two values here,
# which are unpacked as the loss and the accuracy metric.
test_loss, test_acc = model.evaluate(
    x=[test_input_ids, test_attention_masks],
    y=test_labels,
    verbose=2,
)
print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

137/137 - 130s - loss: 0.2708 - accuracy: 0.9133 - 130s/epoch - 951ms/step
Test Loss: 0.27080944180488586
Test Accuracy: 0.9132944941520691




AxisError: ignored