In [1]:
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
df  = pd.read_csv('Truth_Seeker_Model_Dataset.csv')

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maddy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maddy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maddy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Preprocessing

In [4]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove non-alphanumeric characters
    text = re.sub(r'\W+', ' ', text)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    
    # Join tokens back into a single string
    preprocessed_text = ' '.join(lemmatized_tokens)
    
    return preprocessed_text

In [5]:
df['preprocessed_text'] = df['statement'].apply(preprocess_text)

## sentiment scoring

In [6]:
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

In [7]:
df['sentiment_score'] = df['preprocessed_text'].apply(lambda x: TextBlob(x).sentiment.polarity)
df['sentiment_label'] = df['sentiment_score'].apply(lambda x: 'Positive' if x > 0 else 'Negative' if x < 0 else 'Neutral')

In [8]:
df['sentiment_label'].unique()

array(['Negative', 'Positive', 'Neutral'], dtype=object)

In [9]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
df['preprocessed_text'] = df['preprocessed_text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
df['tweet'] = df['tweet'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

# Padding sequences
max_len = max(df['preprocessed_text'].apply(len))
df['preprocessed_text'] = df['preprocessed_text'].apply(lambda x: x + [0] * (max_len - len(x)))



In [11]:
# categorical variables to numerical representations
df = pd.get_dummies(df, columns=['author', 'sentiment_label', '5_label_majority_answer'])


In [12]:
# train-test split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# PyTorch tensors
train_inputs = torch.tensor(train_df['preprocessed_text'].tolist())
train_labels = torch.tensor(train_df['BinaryNumTarget'].tolist())
test_inputs = torch.tensor(test_df['preprocessed_text'].tolist())
test_labels = torch.tensor(test_df['BinaryNumTarget'].tolist())


In [13]:
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader


In [14]:
 #one hot encoding
num_classes = 2
train_labels = torch.eye(num_classes)[train_labels.long()]
test_labels = torch.eye(num_classes)[test_labels.long()]

# DataLoader
train_dataset = TensorDataset(train_inputs, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)


In [15]:
num_labels = len(df['BinaryNumTarget'].unique())
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [16]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)


In [19]:
epochs = 5
criterion = torch.nn.BCEWithLogitsLoss()  
for epoch in range(epochs):
    total_loss = 0
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs, labels = batch

        optimizer.zero_grad()

        outputs = model(inputs)
        logits = outputs.logits.squeeze(1)
        labels = labels.squeeze(1)

        loss = criterion(logits, labels.float())
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{epochs} - Loss: {avg_loss:.4f}')

Epoch 1/5 - Loss: 0.0023
Epoch 2/5 - Loss: 0.0008
Epoch 3/5 - Loss: 0.0013
Epoch 4/5 - Loss: 0.0009
Epoch 5/5 - Loss: 0.0004


In [23]:
model.eval()

test_dataset = TensorDataset(test_inputs, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=16)

predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs, labels = batch

        outputs = model(inputs)
        logits = outputs.logits.squeeze(1)
        predicted_labels = torch.round(torch.sigmoid(logits))

        predictions.extend(predicted_labels.tolist())
        true_labels.extend(labels.tolist())

accuracy = accuracy_score(true_labels, predictions)
report = classification_report(true_labels, predictions)

In [24]:

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

Accuracy: 0.9999254843517139
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13076
           1       1.00      1.00      1.00     13764

   micro avg       1.00      1.00      1.00     26840
   macro avg       1.00      1.00      1.00     26840
weighted avg       1.00      1.00      1.00     26840
 samples avg       1.00      1.00      1.00     26840



## Model returns high accuracy and evaluated by confusion matrix. Overall, the model performs well

In [29]:
from sklearn.metrics import confusion_matrix
import numpy as np

In [30]:
true_labels1 = np.argmax(true_labels, axis=1)
predictions1 = np.argmax(predictions, axis=1)

# Calculate confusion matrix
cm = confusion_matrix(true_labels1, predictions1)
print(cm)

[[13076     0]
 [    2 13762]]


In [31]:
from transformers import BertForSequenceClassification

In [32]:
model_dir = 'BERT'
model.save_pretrained(model_dir)