<a href="https://colab.research.google.com/github/RishabhNegi1/Research-Fake-News-/blob/main/Non_opti_D4_ALBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install transformers datasets scikit-learn pandas torch emoji==0.6.0

import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pickle

# Load the two source datasets. The CSV files are expected in the working
# directory under the names 'Fake.csv' and 'True.csv'.
df_fake = pd.read_csv('Fake.csv')
df_real = pd.read_csv('True.csv')

# Tag every row with its class before the two frames are merged
df_fake['label'] = 0  # Fake news
df_real['label'] = 1  # Real news

# Combine the datasets
df_combined = pd.concat([df_fake, df_real], ignore_index=True)

# Shuffle the dataset to mix fake and real news. The seed is fixed so that
# the row order, the saved CSV and every downstream result are reproducible
# from one run to the next.
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

# Optional: Save the combined dataset to a new CSV file
df_combined.to_csv('combined_news.csv', index=False)

# Display the first few rows to understand the structure
print(df_combined.head())

# Inspect the column names to identify the text and label columns
print(df_combined.columns)

# Update these variables with the actual column names in your dataset
text_column = 'text'  # Assuming the text column is named 'text'
label_column = 'label'  # The correct column name for the labels

# NOTE(review): in the sample output recorded with this notebook, the
# real-news rows start with a "<CITY> (Reuters) -" dateline while the fake
# rows do not. If that holds across True.csv, a classifier can key on the
# source marker rather than the content -- confirm (and consider stripping
# the dateline) before trusting the reported accuracy.

# Encode the labels as integers. The labels assigned above are already 0/1,
# so this leaves the values unchanged; the fitted encoder is kept because it
# is used later for the report's class names and is pickled at the end.
label_encoder = LabelEncoder()
df_combined[label_column] = label_encoder.fit_transform(df_combined[label_column])

# Verify the encoding (optional)
print("Encoded labels:", label_encoder.classes_)

# Preprocess the dataset
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on demand.

    Args:
        texts: Sequence of raw documents (list, NumPy array or pandas Series).
        labels: Sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts the keyword arguments used in ``__getitem__`` and returns
            a mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every sequence is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materialise both inputs as plain lists so that lookups in
        # __getitem__ are always positional. Indexing a pandas Series with
        # an integer is label-based and raises KeyError once the index has
        # been shuffled or split.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self) -> int:
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx: int) -> dict:
        """Tokenize example ``idx`` and return the tensors for one sample.

        Returns:
            A dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors
            and a scalar ``'labels'`` tensor of dtype ``torch.long``.
        """
        # str() guards against non-string cells (e.g. NaN read from a CSV).
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Call the tokenizer directly; this is the documented replacement
        # for the deprecated `encode_plus` and takes the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        # return_tensors='pt' yields a leading batch dimension of 1;
        # flatten it away so the DataLoader can stack samples itself.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
        }

# Local import: only needed to probe which keyword names the installed
# transformers release accepts (see below).
import inspect

# Load ALBERT tokenizer and model. The classification head gets one output
# per class known to the label encoder.
tokenizer = AutoTokenizer.from_pretrained('albert-base-v2', use_fast=False)
model = AutoModelForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_encoder.classes_),
)

# Split the dataset into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_combined[text_column].values,
    df_combined[label_column].values,
    test_size=0.2,
    random_state=42
)

# Define max token length
MAX_LEN = 128

# Create train and validation datasets
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer, MAX_LEN)

# transformers renamed `evaluation_strategy` to `eval_strategy`, and the
# Trainer's `tokenizer` argument to `processing_class`. The install above is
# unpinned, so pick whichever name the installed version actually accepts
# instead of failing with a TypeError on one side of the rename.
eval_strategy_kwarg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)
tokenizer_kwarg = (
    'processing_class'
    if 'processing_class' in inspect.signature(Trainer.__init__).parameters
    else 'tokenizer'
)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    **{eval_strategy_kwarg: "epoch"},
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    **{tokenizer_kwarg: tokenizer},
)

# Train the model
trainer.train()

# Evaluate the model. The call sits in the middle of the cell, so its return
# value would be silently dropped unless it is printed explicitly.
eval_metrics = trainer.evaluate()
print("Evaluation metrics:", eval_metrics)

# Predictions for validation data: the highest-scoring logit per row is the
# predicted class id. Staying in NumPy avoids a round-trip through torch
# before handing the result to scikit-learn.
predictions = trainer.predict(val_dataset)
preds = predictions.predictions.argmax(axis=1)

# Convert label classes to strings (necessary for classification_report)
target_names = [str(label) for label in label_encoder.classes_]
# LabelEncoder maps classes to 0..n-1; passing the ids explicitly keeps the
# report rows aligned with target_names even if a class is missing from the
# validation split.
class_ids = list(range(len(target_names)))

# Calculate and print detailed classification metrics
accuracy = accuracy_score(val_labels, preds)
conf_matrix = confusion_matrix(val_labels, preds, labels=class_ids)
class_report = classification_report(
    val_labels,
    preds,
    labels=class_ids,
    target_names=target_names,
)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

# Optional: Save the label encoder for later use
with open('label_encoder.pkl', 'wb') as le_file:
    pickle.dump(label_encoder, le_file)


                                               title  \
0  KING OBAMA Just Proclaimed The Month Of June W...   
1  Merkel says fight against terrorism no excuse ...   
2  Cabinet members lobby Trump to remove Iraq fro...   
3  French arrest nine, Swiss one in joint anti-te...   
4   Hateful Pastor: Kill Girl Scout Leaders Becau...   

                                                text       subject  \
0  So let it be said so let it be done.I don t re...      politics   
1  BERLIN (Reuters) - German Chancellor Angela Me...  politicsNews   
2  WASHINGTON (Reuters) - Senior Cabinet members ...  politicsNews   
3  PARIS (Reuters) - French police arrested nine ...     worldnews   
4  Gay-hating, Ted Cruz-loving Pastor Kevin Swans...          News   

                date  label  
0       May 31, 2016      0  
1  January 29, 2017       1  
2     March 2, 2017       1  
3  November 7, 2017       1  
4     March 17, 2016      0  
Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.0027,0.023725
2,0.0446,0.013112
3,0.001,0.015644


Accuracy: 0.9978841870824053
Confusion Matrix:
 [[4697    8]
 [  11 4264]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4705
           1       1.00      1.00      1.00      4275

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980

