In [1]:
import pandas as pd
import os
import json
from tqdm import tqdm
from glob import glob

In [25]:
def load_pheme5_original(main_folder):
    """Load the source tweets of the PHEME rumour / non-rumour dataset.

    The expected layout is
    ``<main_folder>/<event>/<rumours|non-rumours>/<thread>/source-tweet/*.json``.
    Events, threads and JSON files are visited in sorted order so that the
    resulting row order (and any seeded split derived from it) does not depend
    on the order in which the file system happens to list directories.

    Parameters
    ----------
    main_folder : str
        Root directory holding one sub-directory per event.

    Returns
    -------
    pandas.DataFrame
        One row per readable source tweet with the columns ``id``, ``text``,
        ``created_at``, ``label`` (the folder name, ``'rumours'`` or
        ``'non-rumours'``), ``followers_count``, ``friends_count``,
        ``verified``, ``retweet_count`` and ``favorite_count``. Fields that
        are missing from the JSON are ``None``. The columns are present even
        when no tweet is found.

    Notes
    -----
    Files that cannot be read or parsed are reported with ``print`` and
    skipped; they never abort the load.
    """
    columns = [
        'id', 'text', 'created_at', 'label',
        'followers_count', 'friends_count', 'verified',
        'retweet_count', 'favorite_count',
    ]
    data = []

    # Traverse event folders
    for event_name in sorted(os.listdir(main_folder)):
        event_path = os.path.join(main_folder, event_name)
        if not os.path.isdir(event_path):
            continue

        for label_type in ['rumours', 'non-rumours']:
            label_path = os.path.join(event_path, label_type)
            if not os.path.isdir(label_path):
                continue

            for tweet_folder in sorted(os.listdir(label_path)):
                tweet_dir = os.path.join(label_path, tweet_folder, 'source-tweet')
                if not os.path.isdir(tweet_dir):
                    continue

                # Load JSON file(s) in the source-tweet folder
                for json_file in sorted(glob(os.path.join(tweet_dir, '*.json'))):
                    try:
                        with open(json_file, 'r', encoding='utf-8') as f:
                            tweet = json.load(f)
                    except (OSError, ValueError) as e:
                        # ValueError covers both malformed JSON and bad UTF-8.
                        print(f"Error reading {json_file}: {e}")
                        continue

                    if not isinstance(tweet, dict):
                        print(f"Error reading {json_file}: expected a JSON object")
                        continue

                    # The key may be absent or explicitly null.
                    user = tweet.get('user')
                    if not isinstance(user, dict):
                        user = {}

                    data.append({
                        'id': tweet.get('id'),
                        'text': tweet.get('text'),
                        'created_at': tweet.get('created_at'),
                        'label': label_type,
                        'followers_count': user.get('followers_count'),
                        'friends_count': user.get('friends_count'),
                        'verified': user.get('verified'),
                        'retweet_count': tweet.get('retweet_count'),
                        'favorite_count': tweet.get('favorite_count')
                    })

    # Passing the column list keeps the schema stable for an empty result.
    df = pd.DataFrame(data, columns=columns)
    return df

In [26]:
# Dataset location. Set the PHEME_DIR environment variable to point the
# notebook at a copy of the data stored elsewhere; the default is the original
# author's local path.
PHEME_DIR = os.environ.get(
    'PHEME_DIR',
    r'D:\text datasets\text datasets\phemernrdataset\pheme-rnr-dataset',
)
pheme_df = load_pheme5_original(PHEME_DIR)
pheme_df.head()

Unnamed: 0,id,text,created_at,label,followers_count,friends_count,verified,retweet_count,favorite_count
0,552783238415265792,"Breaking: At least 10 dead, 5 injured after tO...",Wed Jan 07 11:06:08 +0000 2015,rumours,1628,246,False,159,14
1,552783667052167168,France: 10 people dead after shooting at HQ of...,Wed Jan 07 11:07:51 +0000 2015,rumours,129573,337,True,486,38
2,552783745565347840,Ten killed in shooting at headquarters of Fren...,Wed Jan 07 11:08:09 +0000 2015,rumours,529882,3051,True,127,15
3,552784168849907712,BREAKING: 10 dead in shooting at headquarters ...,Wed Jan 07 11:09:50 +0000 2015,rumours,499741,31,True,105,15
4,552784526955806720,Reuters: 10 people shot dead at headquarters o...,Wed Jan 07 11:11:16 +0000 2015,rumours,1377384,6,True,412,32


# Preprocessing

In [27]:
import re
import string
from nltk.corpus import stopwords

In [28]:
#PREPROCESSING USING REGULAR EXPRESSION
# Contractions expanded by clean_texts, keyed by their lower-case form written
# with a plain ASCII apostrophe.
_CONTRACTIONS = {
    "that's": "that is",
    "there's": "there is",
    "what's": "what is",
    "where's": "where is",
    "it's": "it is",
    "who's": "who is",
    "i'm": "i am",
    "she's": "she is",
    "he's": "he is",
    "they're": "they are",
    "who're": "who are",
    "ain't": "am not",
    "wouldn't": "would not",
    "shouldn't": "should not",
    "can't": "can not",
    "couldn't": "could not",
    "won't": "will not",
    "didn't": "did not",
    "doesn't": "does not",
}

# One alternation that accepts either the ASCII (') or the typographic (’)
# apostrophe, anchored on word boundaries so that e.g. "he's" never fires
# inside "she's".
_CONTRACTION_RE = re.compile(
    r"\b(" + "|".join(key.replace("'", "['’]") for key in _CONTRACTIONS) + r")\b",
    re.IGNORECASE,
)


def _expand_contraction(match):
    """Return the expansion for a contraction matched by ``_CONTRACTION_RE``."""
    key = match.group(0).lower().replace("’", "'")
    return _CONTRACTIONS[key]


def clean_texts(text):
    """Normalise a raw tweet into space-separated ASCII letters.

    Steps, in order:

    1. expand a fixed set of English contractions (case-insensitive, with
       either a straight or a typographic apostrophe);
    2. replace URLs (anything starting with ``http``) by a space;
    3. replace ``@mentions`` by a space;
    4. replace every character that is not an ASCII letter, digit or space by
       a space (this also drops ``#``, punctuation, emoji and newlines);
    5. delete digits;
    6. collapse runs of spaces into one.

    Case is preserved except inside expanded contractions, and a single
    leading or trailing space may remain.

    Parameters
    ----------
    text : str
        The raw tweet text. Any non-string value (``None``, ``NaN``) is
        treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``""`` for missing input.
    """
    if not isinstance(text, str):
        return ""

    text = _CONTRACTION_RE.sub(_expand_contraction, text)
    # remove URL
    text = re.sub(r"http\S+", " ", text)
    # remove usernames
    text = re.sub(r"@[^\s]+[\s]?", " ", text)
    # remove special characters; after this only [ a-zA-Z0-9] remain, so no
    # further hashtag / punctuation / non-ASCII handling is required
    text = re.sub(r"[^ a-zA-Z0-9]", " ", text)
    # remove numbers
    text = re.sub(r"[0-9]", "", text)
    text = re.sub(r" +", " ", text)

    return text

In [29]:
# Run the regex cleaner over every tweet and store the result back in 'text'.
pheme_df['text'] = pheme_df['text'].map(clean_texts)

In [30]:
# NLTK's English stop-word list, held in a set for fast membership tests.
stops = set(stopwords.words("english"))

# Lower-case each tweet, then rebuild it from the tokens that are not stop words.
pheme_df['text'] = pheme_df['text'].str.lower()
pheme_df['text'] = pheme_df['text'].apply(
    lambda sentence: ' '.join(
        token for token in sentence.split() if token.lower() not in stops
    )
)

In [31]:
# Encode the folder-name labels as integers (non-rumour = 0, rumour = 1).
# Values that are already encoded pass through unchanged, so re-running this
# cell is harmless; a bare `.map(dict)` would turn the column into NaN on the
# second run. An unexpected label makes the integer cast fail loudly.
LABEL_TO_ID = {'non-rumours': 0, 'rumours': 1}
pheme_df['label'] = (
    pheme_df['label']
    .map(lambda value: LABEL_TO_ID.get(value, value))
    .astype('int64')
)

# BERT

In [33]:
from transformers import BertTokenizer
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
import pandas as pd

In [34]:
# Fetch the WordPiece tokenizer that belongs to the uncased BERT-base checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)



In [35]:
# Text tokenization function
def tokenize_text(text_list, max_length=128):
    """Tokenise a batch of texts with the notebook-level ``tokenizer``.

    Parameters
    ----------
    text_list : list of str
        The texts to encode.
    max_length : int, optional
        Upper bound on the encoded length; longer inputs are truncated.
        Defaults to 128.

    Returns
    -------
    The tokenizer's output with ``return_tensors='pt'`` (PyTorch tensors),
    with padding and truncation enabled.
    """
    return tokenizer(
        text_list,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

In [36]:
# Numeric side features fed to the model next to the text. Missing values
# become 0 and the boolean 'verified' flag is cast to an integer.
user_features = [
    'followers_count',
    'friends_count',
    'verified',
    'retweet_count',
    'favorite_count',
]
pheme_df[user_features] = pheme_df[user_features].fillna(value=0)
pheme_df['verified'] = pheme_df['verified'].astype(int)

In [37]:
# Fit the scaler on the training rows only, then apply it to every row.
# Fitting on the whole frame would let test-set statistics leak into training.
# The arguments below deliberately mirror the train/test split performed on
# pheme_df afterwards (same test_size, random_state and stratify column), so
# the same rows are selected -- keep the two calls in sync.
train_index, test_index = train_test_split(
    pheme_df.index.to_numpy(),
    test_size=0.2,
    random_state=42,
    stratify=pheme_df['label'],
)
scaler = StandardScaler()
scaler.fit(pheme_df.loc[train_index, user_features])
pheme_df[user_features] = scaler.transform(pheme_df[user_features])

In [38]:
# Hold out 20% of the tweets for testing. Stratifying on the label keeps the
# class ratio the same in both parts; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(
    pheme_df,
    test_size=0.2,
    random_state=42,
    stratify=pheme_df['label'],
)

In [45]:
import torch.nn as nn
from transformers import BertModel
from torch.utils.data import DataLoader
from transformers import AdamW
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm
import torch.nn.functional as F

In [40]:
class RumorDataset(Dataset):
    """Torch dataset pairing tokenised tweet text with numeric side features.

    The whole frame is tokenised once in the constructor; indexing then only
    slices the pre-computed tensors.
    """

    def __init__(self, dataframe):
        """Read text, labels and the ``user_features`` columns from ``dataframe``."""
        self.texts = dataframe['text'].tolist()
        self.labels = dataframe['label'].tolist()
        self.user_feats = dataframe[user_features].values
        self.encodings = tokenize_text(self.texts)

    def __len__(self):
        """Number of samples, i.e. number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors.

        The dict holds every tokenizer field sliced at ``idx`` plus
        ``'user_feats'`` (float tensor) and ``'labels'`` (long tensor).
        """
        sample = {}
        for field, tensor in self.encodings.items():
            sample[field] = tensor[idx]
        sample['user_feats'] = torch.tensor(self.user_feats[idx], dtype=torch.float)
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Build one dataset per split (training first, then test); each constructor
# call tokenises its split up front.
train_dataset, test_dataset = (
    RumorDataset(split) for split in (train_df, test_df)
)

In [41]:
class BertWithUserFeatures(nn.Module):
    """BERT text encoder combined with a small projection of numeric features.

    The pooled BERT output is concatenated with a linear projection of the
    numeric side features, passed through dropout and classified by a single
    linear layer.

    Parameters
    ----------
    num_user_features : int, optional
        Width of the numeric feature vector. Defaults to
        ``len(user_features)`` (the notebook-level feature list).
    user_hidden_size : int, optional
        Output width of the feature projection. Defaults to 32.
    num_labels : int, optional
        Number of output classes. Defaults to 2.
    dropout : float, optional
        Dropout probability applied to the concatenated vector. Defaults to 0.3.
    pretrained_name : str, optional
        Checkpoint passed to ``BertModel.from_pretrained``. Defaults to
        ``'bert-base-uncased'``.
    """

    def __init__(self, num_user_features=None, user_hidden_size=32,
                 num_labels=2, dropout=0.3,
                 pretrained_name='bert-base-uncased'):
        super().__init__()
        if num_user_features is None:
            num_user_features = len(user_features)

        # Layers are created in the same order as before so that a seeded run
        # draws the same initial weights.
        self.bert = BertModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(dropout)
        self.user_fc = nn.Linear(num_user_features, user_hidden_size)
        self.classifier = nn.Linear(
            self.bert.config.hidden_size + user_hidden_size, num_labels
        )

    def forward(self, input_ids, attention_mask, user_feats):
        """Return the raw class logits for a batch.

        ``input_ids`` and ``attention_mask`` are forwarded to BERT;
        ``user_feats`` is the matching batch of numeric features.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        user_emb = self.user_fc(user_feats)
        # Join text and feature representations along the feature axis.
        combined = torch.cat((pooled_output, user_emb), dim=1)
        combined = self.dropout(combined)
        logits = self.classifier(combined)
        return logits

In [42]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertWithUserFeatures().to(device)
# torch.optim.AdamW replaces the AdamW class shipped with `transformers`,
# which is deprecated upstream. eps and weight_decay are pinned to the
# defaults of the class being replaced (PyTorch's own defaults are 1e-8 and
# 0.01), so the optimisation settings stay the same.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0
)



In [43]:
# Mini-batches of 16 samples; only the training loader shuffles its data.
BATCH_SIZE = 16
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
# Training loop
NUM_EPOCHS = 3
# Built once: the loss module carries no per-batch state, so it does not need
# to be re-created for every batch.
criterion = nn.CrossEntropyLoss()

for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0.0
    all_preds = []
    all_labels = []

    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")

    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        user_feats = batch['user_feats'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, user_feats)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for the running total and the
        # progress bar.
        batch_loss = loss.item()
        total_loss += batch_loss

        # Collect predictions for accuracy. These come from the training-mode
        # forward pass (dropout active), so this is a running training
        # accuracy, not an evaluation.
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.detach().cpu().numpy())
        all_labels.extend(labels.detach().cpu().numpy())

        # Update progress bar
        loop.set_postfix(loss=batch_loss)

    # Epoch stats: mean loss per batch and accuracy over all training samples
    avg_loss = total_loss / len(train_loader)
    epoch_acc = accuracy_score(all_labels, all_preds)

    print(f"\nEpoch {epoch+1} Loss: {avg_loss:.4f} Training Accuracy: {epoch_acc:.4f}\n")


Epoch 1: 100%|██████████████████████████████████████████████████████████| 291/291 [09:03<00:00,  1.87s/it, loss=0.0417]



Epoch 1 Loss: 0.0775 Training Accuracy: 0.9754



Epoch 2: 100%|█████████████████████████████████████████████████████████| 291/291 [10:11<00:00,  2.10s/it, loss=0.00578]



Epoch 2 Loss: 0.0451 Training Accuracy: 0.9847



Epoch 3:  32%|██████████████████▋                                       | 94/291 [03:17<06:08,  1.87s/it, loss=0.00237]

In [None]:
# Evaluation
model.eval()  # switch the model to inference mode for the held-out pass
all_preds, all_labels = [], []

# No gradients are needed while scoring the test set.
with torch.no_grad():
    for batch in test_loader:
        labels = batch['labels'].to(device)
        outputs = model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
            batch['user_feats'].to(device),
        )
        # Predicted class = index of the largest logit in each row.
        preds = outputs.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

In [None]:
# Overall accuracy on the held-out set
acc = accuracy_score(all_labels, all_preds)
print("Test Accuracy:", acc)

# Per-class precision / recall / F1 (class 0 = non-rumor, class 1 = rumor)
print("Classification Report:")
report = classification_report(
    all_labels,
    all_preds,
    target_names=["non-rumor", "rumor"],
)
print(report)