##### Step 1: Loading the dataset

In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv("../data/Constraint_Train.csv")



# If you want to see the first few real tweets
print(df.head(50))
print(len(df))

##### Step 2: Preprocess the text


In [2]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the texts
inputs = tokenizer(
    list(df['tweet']),
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=128
)

##### Step 3: Prepare the labels

In [3]:
print(df['label'].unique())
print(df['label'].head(10))
print(df['label'].value_counts(dropna=False))


['real' 'fake']
0    real
1    real
2    fake
3    real
4    real
5    real
6    real
7    fake
8    fake
9    fake
Name: label, dtype: object
label
real    3360
fake    3060
Name: count, dtype: int64


In [4]:
# Step 1: Map 'real'/'fake' to 1/0
df['label'] = df['label'].map({'fake': 0, 'real': 1})

# Step 2: Remove any NaNs that came from unmapped values
df = df.dropna(subset=['label'])

# Step 3: Convert labels to integers
df['label'] = df['label'].astype(int)

# Step 4: Check the result
print(df['label'].unique())


[1 0]


In [5]:
from torch.utils.data import TensorDataset, DataLoader, random_split
import torch

labels = torch.tensor(df['label'].values, dtype=torch.long)

dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

##### Step 5: Training the model

In [6]:
from transformers import BertForSequenceClassification, BertTokenizer
from torch.utils.data import DataLoader, Dataset
import torch

# Assuming train_loader is already defined
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Assign higher weight to class 0 (fake), lower to class 1 (true)
# Adjust as needed (e.g., [2.0, 1.0] or [3.0, 1.0] for stronger penalty on false negatives)
class_weights = torch.tensor([5.0, 1.0]).to(device)
loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)

model.train()
for epoch in range(1):  # Adjust as needed
    print(f"Epoch {epoch + 1}")
    for batch in train_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = loss_fct(logits, labels)  # manual loss with weight

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        print(f"Loss: {loss.item():.4f}")



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1
Loss: 0.6704
Loss: 0.7015
Loss: 0.6494
Loss: 0.6200
Loss: 0.6001
Loss: 0.5533
Loss: 0.5396
Loss: 0.5984
Loss: 0.4442
Loss: 0.6472
Loss: 0.3948
Loss: 0.4124
Loss: 0.6437
Loss: 0.5478
Loss: 0.4082
Loss: 0.3231
Loss: 0.4878
Loss: 0.4900
Loss: 0.5173
Loss: 0.2928
Loss: 0.3440
Loss: 0.3656
Loss: 0.3997
Loss: 0.3382
Loss: 0.5586
Loss: 0.4750
Loss: 0.3946
Loss: 0.3134
Loss: 0.3240
Loss: 0.3943
Loss: 0.2930
Loss: 0.5217
Loss: 0.3647
Loss: 0.5292
Loss: 0.4592
Loss: 0.2673
Loss: 0.3355
Loss: 0.3595
Loss: 0.3627
Loss: 0.3002
Loss: 0.2691
Loss: 0.3361
Loss: 0.1959
Loss: 0.4191
Loss: 0.1430
Loss: 0.3040
Loss: 0.4626
Loss: 0.7963
Loss: 0.3097
Loss: 0.2968
Loss: 0.1919
Loss: 0.1409
Loss: 0.3122
Loss: 0.2100
Loss: 0.2353
Loss: 0.1425
Loss: 0.1730
Loss: 0.1085
Loss: 0.1365
Loss: 0.1856
Loss: 0.2005
Loss: 0.0877
Loss: 0.2966
Loss: 0.1265
Loss: 0.1264
Loss: 0.3197
Loss: 0.1610
Loss: 0.0541
Loss: 0.3908
Loss: 0.1162
Loss: 0.2759
Loss: 0.6199
Loss: 0.1476
Loss: 0.0734
Loss: 0.2525
Loss: 0.3042
Loss

In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

true_labels = []
predicted_labels = []

model.eval()
with torch.no_grad():
    for batch in train_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        predicted_labels.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Calculate metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='binary')
recall = recall_score(true_labels, predicted_labels, average='binary')
f1 = f1_score(true_labels, predicted_labels, average='binary')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Optional: Full classification report
print("\nClassification Report:")
print(classification_report(true_labels, predicted_labels, target_names=["Fake", "True"]))

Accuracy: 0.9611
Precision: 0.9956
Recall: 0.9301
F1-Score: 0.9617

Classification Report:
              precision    recall  f1-score   support

        Fake       0.93      1.00      0.96      2434
        True       1.00      0.93      0.96      2702

    accuracy                           0.96      5136
   macro avg       0.96      0.96      0.96      5136
weighted avg       0.96      0.96      0.96      5136



##### Step 6: Predict on a new tweet

In [8]:
def predict(tweet):
    model.eval()
    with torch.no_grad():
        inputs = tokenizer(tweet, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()
        return "True" if predicted_class == 1 else "False"


In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

true_labels = []
predicted_labels = []

model.eval()
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        predicted_labels.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Calculate metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='binary')
recall = recall_score(true_labels, predicted_labels, average='binary')
f1 = f1_score(true_labels, predicted_labels, average='binary')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Optional: Full classification report
print("\nClassification Report:")
print(classification_report(true_labels, predicted_labels, target_names=["Fake", "True"]))

Accuracy: 0.9424
Precision: 0.9867
Recall: 0.8997
F1-Score: 0.9412

Classification Report:
              precision    recall  f1-score   support

        Fake       0.90      0.99      0.94       626
        True       0.99      0.90      0.94       658

    accuracy                           0.94      1284
   macro avg       0.95      0.94      0.94      1284
weighted avg       0.95      0.94      0.94      1284



Predicting on a false tweet

In [10]:
example_tweet = "Breaking news: Chocolate can cure COVID!"
result = predict(example_tweet)
print(f"Prediction: {result}")


Prediction: False


Predicting on a factual tweet

In [11]:
factual_tweet = "The COVID-19 vaccine helps reduce the severity of symptoms and the risk of hospitalization."
result = predict(factual_tweet)
print(f"Prediction: {result}")


Prediction: False


### Interactive Gradio App

In [12]:
import gradio as gr

iface = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(lines=2, placeholder="Type your tweet here..."),
    outputs="text",
    title="Fake news Predictor",
    description="Enter a tweet and let the model classify it as factual or fake"
)

iface.launch(share=True)

* Running on local URL:  http://127.0.0.1:7860

Could not create share link. Missing file: C:\Users\elias\.cache\huggingface\gradio\frpc\frpc_windows_amd64_v0.3. 

Please check your internet connection. This can happen if your antivirus software blocks the download of this file. You can install manually by following these steps: 

1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_windows_amd64.exe
2. Rename the downloaded file to: frpc_windows_amd64_v0.3
3. Move the file to this location: C:\Users\elias\.cache\huggingface\gradio\frpc




Created dataset file at: .gradio\flagged\dataset1.csv
