##### Step 1: Loading the dataset

In [13]:
import device
import pandas as pd

# Load the dataset
df = pd.read_csv("../data/Constraint_Train.csv")



# If you want to see the first few real tweets
print(df.head(50))
print(len(df))

    id                                              tweet label
0    1  The CDC currently reports 99031 deaths. In gen...  real
1    2  States reported 1121 deaths a small rise from ...  real
2    3  Politically Correct Woman (Almost) Uses Pandem...  fake
3    4  #IndiaFightsCorona: We have 1524 #COVID testin...  real
4    5  Populous states can generate large case counts...  real
5    6  Covid Act Now found "on average each person in...  real
6    7  If you tested positive for #COVID19 and have n...  real
7    8  Obama Calls Trump’s Coronavirus Response A Cha...  fake
8    9  ???Clearly, the Obama administration did not l...  fake
9   10  Retraction—Hydroxychloroquine or chloroquine w...  fake
10  11  Take simple daily precautions to help prevent ...  real
11  12  The NBA is poised to restart this month. In Ma...  fake
12  13  We just announced that the first participants ...  real
13  14  #CoronaVirusUpdates #IndiaFightsCorona More th...  real
14  15  Protect yourself and others from

##### Step 2: Preprocess the text


In [14]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the texts
inputs = tokenizer(
    list(df['tweet']),
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=128
)

##### Step 3: Prepare the labels

In [15]:
print(df['label'].unique())
print(df['label'].head(10))
print(df['label'].value_counts(dropna=False))


['real' 'fake']
0    real
1    real
2    fake
3    real
4    real
5    real
6    real
7    fake
8    fake
9    fake
Name: label, dtype: object
label
real    3360
fake    3060
Name: count, dtype: int64


In [16]:
# Step 1: Map 'real'/'fake' to 1/0
df['label'] = df['label'].map({'fake': 0, 'real': 1})

# Step 2: Remove any NaNs that came from unmapped values
df = df.dropna(subset=['label'])

# Step 3: Convert labels to integers
df['label'] = df['label'].astype(int)

# Step 4: Check the result
print(df['label'].unique())


[1 0]


In [18]:
from torch.utils.data import TensorDataset, DataLoader, random_split
import torch

labels = torch.tensor(df['label'].values)

dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

##### Step 5: Training the model

In [23]:
from transformers import BertForSequenceClassification, BertTokenizer
from torch.utils.data import DataLoader, Dataset
import torch

# Assuming tokenizer, train_loader, etc. are defined
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fct = torch.nn.CrossEntropyLoss()
model.train()
for epoch in range(6):  # Adjust epochs as needed
    print(f"Epoch {epoch + 1}")
    for batch in train_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]

        # Forward pass WITH labels so model computes the loss automatically
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss  # This is better than doing it manually with loss_fct

        # Backward pass
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        print(f"Loss: {loss.item():.4f}")



Loss: 0.6710
Loss: 0.6666
Loss: 0.6775


KeyboardInterrupt: 

##### Step 6: Predict on a new tweet

In [25]:
def predict(tweet):
    model.eval()
    with torch.no_grad():
        inputs = tokenizer(tweet, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()
        return "True" if predicted_class == 1 else "False"


Predicting on a false tweet

In [26]:
example_tweet = "Breaking news: Chocolate can cure COVID!"
result = predict(example_tweet)
print(f"Prediction: {result}")


Prediction: False


Predicting on a factual tweet

In [27]:
factual_tweet = "The COVID-19 vaccine helps reduce the severity of symptoms and the risk of hospitalization."
result = predict(factual_tweet)
print(f"Prediction: {result}")


Prediction: False


In [None]:
!pip install gradio
python
Copy
Edit


### Interactive Gradio App

In [None]:
import gradio as gr

iface = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(lines=2, placeholder="Type your tweet here..."),
    outputs="text",
    title="Fake news Predictor",
    description="Enter a tweet and let the model classify it as factual or fake"
)

iface.launch(share=True)