In [1]:
# ==============================================================================
# 1. Install Libraries
# ==============================================================================
# The transformers library is essential for loading the model and tokenizer.
# This command installs it if it's not already present in the environment.
!pip install transformers torch



In [2]:
# ==============================================================================
# 2. Import Libraries
# ==============================================================================
import os

import pandas as pd
import torch
from google.colab import drive
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [3]:
# ==============================================================================
# 3. Mount Google Drive
# ==============================================================================
print("Mounting Google Drive...")
try:
    # Opens an authentication prompt so this runtime can read files from your Drive.
    drive.mount('/content/drive')
except Exception as e:
    print(f"❌ Error mounting Google Drive: {e}")
    # Re-raise instead of calling exit(): in a notebook exit() asks the kernel to
    # shut down (losing all state), whereas an exception simply stops execution
    # here and shows the traceback.
    raise
else:
    print("Google Drive mounted successfully!")

# Set device to GPU if available, otherwise use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Mounting Google Drive...
Mounted at /content/drive
Google Drive mounted successfully!
Using device: cpu


In [12]:
# ==============================================================================
# 4. Define Paths and Load Dataset
# ==============================================================================
# Locations of the dataset and of the fine-tuned model weights in Google Drive.
# (os and pandas are imported in the notebook's import cell.)
data_folder_path = '/content/drive/MyDrive/Fake_New_Classification'
dataset_path = os.path.join(data_folder_path, 'fake_real_news.csv')
model_save_path = os.path.join(data_folder_path, 'mbert_model_state.bin')

# Load the combined dataset
try:
    combined_df = pd.read_csv(dataset_path)
except FileNotFoundError:
    print(f"\n❌ Error: Dataset file not found at {dataset_path}.")
    print("Please check the path and make sure your dataset is in the correct location.")
    # Re-raise instead of calling exit(): exit() would ask the notebook kernel to
    # shut down, while an exception just halts execution at this cell.
    raise
else:
    print(f"\n✅ Dataset loaded successfully from: {dataset_path}")
    print(f"Dataset shape: {combined_df.shape}")


✅ Dataset loaded successfully from: /content/drive/MyDrive/Fake_New_Classification/fake_real_news.csv
Dataset shape: (1096, 5)


In [13]:
# ==============================================================================
# 5. Load the Saved Model and Tokenizer
# ==============================================================================
MODEL_NAME = 'bert-base-multilingual-cased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# The base checkpoint ships without a classification head, so transformers warns
# that classifier.weight / classifier.bias are newly initialized. That is expected
# here: load_state_dict below (strict by default) replaces every parameter with
# the fine-tuned values from the saved state dict.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

try:
    # weights_only=True restricts unpickling to tensors and plain containers, so a
    # tampered file cannot execute arbitrary code while being loaded.
    state_dict = torch.load(model_save_path, map_location=device, weights_only=True)
except FileNotFoundError:
    print(f"❌ Error: Model file not found at {model_save_path}.")
    print("Please check the path and make sure you've saved the models correctly.")
    # Re-raise instead of calling exit(): exit() would ask the notebook kernel to
    # shut down, while an exception just halts execution at this cell.
    raise

model.load_state_dict(state_dict)
model.to(device)
model.eval()  # inference mode: disables dropout
print(f"✅ Model successfully loaded from: {model_save_path}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Model successfully loaded from: /content/drive/MyDrive/Fake_New_Classification/mbert_model_state.bin


In [14]:
# ==============================================================================
# 6. Create Helper Functions
# ==============================================================================
def find_source_domain(headline, df):
    """
    Look up the source domain recorded for a headline.

    The 'news' column of ``df`` is compared against ``headline`` for exact
    equality. The 'source_domain' value of the first matching row is returned;
    when no row matches, the string "Not Found in Dataset" is returned instead.
    """
    is_match = df['news'] == headline
    if is_match.any():
        # Several rows may share the headline; report the first one.
        return df.loc[is_match, 'source_domain'].iloc[0]
    return "Not Found in Dataset"

def classify_headline(text, model, tokenizer, max_len=128):
    """
    Classifies a single news headline as 'Real' (0) or 'Fake' (1).

    Args:
        text: The headline string to classify.
        model: A sequence-classification model; calling it with ``input_ids``
            and ``attention_mask`` must return an object whose ``.logits`` holds
            one row of two scores (index 0 = Real, index 1 = Fake).
        tokenizer: The Hugging Face tokenizer that matches ``model``.
        max_len: Token length the headline is truncated / padded to.

    Returns:
        A ``(predicted_label, confidence_score)`` tuple: the label is 'Real' or
        'Fake', and the confidence is the softmax probability of that label as
        a Python float.
    """
    model.eval()

    # Send the inputs to the device the model actually lives on instead of relying
    # on a notebook-level `device` variable; this keeps the function correct even
    # if the model was moved or the device cell was not run.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    # Calling the tokenizer directly is the documented entry point; encode_plus is
    # a legacy method that takes the same arguments for a single text.
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )
    input_ids = encoding['input_ids'].to(target_device)
    attention_mask = encoding['attention_mask'].to(target_device)

    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        probabilities = torch.softmax(logits, dim=1).flatten()

    # The batch holds exactly one headline, so argmax yields a single class index.
    predicted_index = int(torch.argmax(logits, dim=1).item())
    confidence_score = probabilities[predicted_index].item()
    labels = ['Real', 'Fake']
    predicted_label = labels[predicted_index]

    return predicted_label, confidence_score

In [15]:
# ==============================================================================
# 7. Test with a New Headline (not in the dataset)
# ==============================================================================
print("\n--- Example Prediction with Headline and Automatic Source Lookup ---")

# This headline does not exist in the dataset, so the source lookup is expected
# to fall back to "Not Found in Dataset".
test_headline = "စစ်တပ်က ငြိမ်းချမ်းရေး လိုလားတဲ့အတွက် အပစ်ရပ်စဲကြောင်း ကြေညာ"

# Dataset lookup first, then the model prediction.
source_domain = find_source_domain(test_headline, combined_df)
predicted_label, confidence = classify_headline(test_headline, model, tokenizer)

# Report: one line per field.
for report_line in (
    f"Headline: '{test_headline}'",
    f"Source: {source_domain}",
    f"Predicted Label: {predicted_label}",
    f"Confidence Score: {confidence:.4f}",
):
    print(report_line)


--- Example Prediction with Headline and Automatic Source Lookup ---
Headline: 'စစ်တပ်က ငြိမ်းချမ်းရေး လိုလားတဲ့အတွက် အပစ်ရပ်စဲကြောင်း ကြေညာ'
Source: Not Found in Dataset
Predicted Label: Fake
Confidence Score: 0.9974


In [16]:
# ==============================================================================
# 8. Test with a Real News Headline from the Dataset
# ==============================================================================
print("\n--- Example Prediction with Headline and Automatic Source Lookup ---")

# This headline exists in the dataset, so the lookup can report its source.
test_headline = "ဗိုလ်ချုပ်အောင်ဆန်းသည် အမျိုးသားခေါင်းဆောင်ဖြစ်သည့်အတွက် နိုင်ငံသားတိုင်းပိုင်ဆိုင်ပြီး မဲဆွယ်ရာတွင် ဗိုလ်ချုပ်ပုံ သုံး မသုံးမှာ ပါတီများ၏ သဘောထားသာဖြစ်ဟု ပြည်ထောင်စုရွေးကောက်ပွဲကော်မရှင်ဥက္ကဋ္ဌ ပြောကြား"

# Dataset lookup first, then the model prediction.
source_domain = find_source_domain(test_headline, combined_df)
predicted_label, confidence = classify_headline(test_headline, model, tokenizer)

# Report: one line per field.
for report_line in (
    f"Headline: '{test_headline}'",
    f"Source: {source_domain}",
    f"Predicted Label: {predicted_label}",
    f"Confidence Score: {confidence:.4f}",
):
    print(report_line)


--- Example Prediction with Headline and Automatic Source Lookup ---
Headline: 'ဗိုလ်ချုပ်အောင်ဆန်းသည် အမျိုးသားခေါင်းဆောင်ဖြစ်သည့်အတွက် နိုင်ငံသားတိုင်းပိုင်ဆိုင်ပြီး မဲဆွယ်ရာတွင် ဗိုလ်ချုပ်ပုံ သုံး မသုံးမှာ ပါတီများ၏ သဘောထားသာဖြစ်ဟု ပြည်ထောင်စုရွေးကောက်ပွဲကော်မရှင်ဥက္ကဋ္ဌ ပြောကြား'
Source: news-eleven.com
Predicted Label: Real
Confidence Score: 0.9973


In [17]:
# ==============================================================================
# 9. Test with a Fake News Headline from the Dataset
# ==============================================================================
print("\n--- Example Prediction with Headline and Automatic Source Lookup ---")

# This headline exists in the dataset, so the lookup can report its source.
test_headline = "AI နဲ့ ဖန်တီးပြုလုပ်ထားတဲ့ ငလျင်ကြောင့်ပျက်စီးရုပ်သံဖိုင်များ"

# Dataset lookup first, then the model prediction.
source_domain = find_source_domain(test_headline, combined_df)
predicted_label, confidence = classify_headline(test_headline, model, tokenizer)

# Report: one line per field.
for report_line in (
    f"Headline: '{test_headline}'",
    f"Source: {source_domain}",
    f"Predicted Label: {predicted_label}",
    f"Confidence Score: {confidence:.4f}",
):
    print(report_line)


--- Example Prediction with Headline and Automatic Source Lookup ---
Headline: 'AI နဲ့ ဖန်တီးပြုလုပ်ထားတဲ့ ငလျင်ကြောင့်ပျက်စီးရုပ်သံဖိုင်များ'
Source: bur.mizzima.com
Predicted Label: Fake
Confidence Score: 0.9978
