<a href="https://colab.research.google.com/github/TiredEspressoBean/FakeNewsDetector---AI/blob/main/FakeNewsDetector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install transformers[torch] accelerate

# Problems and Goals

**Problem 1**: Misinformation.

With the proliferation of misinformation — that is, mistruths presented as facts — I wish to gain a stronger understanding of how misinformation works on a larger, systemic level.



**Problem 2**: Lack of personal knowledge about AI.

While I understand the principles behind how the different systems we know as AI operate, I wanted a more tangible understanding of how they operate, gained through a little hands-on practice with such systems.



**Goal**: With both of these problems at hand, why not go ahead and build an AI model that tries to detect fake news? This lets me explore one of the more popular kinds of AI system from the last few years — systems that evaluate language — and, through the data accrued, gives me an understanding of how a machine might detect misinformation from the text alone, without resorting to fact checking, which is a more intensive process.

# Tools Used

Torch: Machine learning framework commonly used with Python for machine learning modeling.

Transformers: Library providing the API for our model.

BERT: Bidirectional Encoder Representations from Transformers, a language model developed by Google specifically for Natural Language Processing.

Pandas, NumPy, and random: general-purpose libraries for tabular data handling, numerical computing, and random number generation, respectively.

In [None]:
import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import pandas as pd
import numpy
import random


# What is BERT?

In [None]:
# Name of the pretrained checkpoint passed to both `from_pretrained` calls below.
model_name = "bert-base-uncased"
# Maximum sequence length handed to the tokenizer calls in this notebook
# (they pass `truncation=True`, so longer inputs are cut to this length).
max_length = 512
# Tokenizer for the checkpoint; lower-casing is requested explicitly.
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

# Sequence-classification model with two output labels
# (`fix_label` maps "FAKE" -> 0 and "REAL" -> 1).
# NOTE(review): this cell runs before `set_seed(1)` is called further down, so
# any random initialisation performed here is presumably not covered by that
# seed — confirm whether the seed should be set first.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [None]:

def set_seed(seed: int):
    """Seed every random number generator this notebook may rely on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices). TensorFlow is seeded as well, but only when
    it can be imported.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    numpy.random.seed(seed)

    # torch is imported unconditionally by this notebook, so it is always
    # seeded. (Previously this was gated on TPU availability, which left
    # torch unseeded on ordinary CPU/GPU runtimes.)
    torch.manual_seed(seed)
    # Safe to call without CUDA: the call is silently ignored in that case.
    torch.cuda.manual_seed_all(seed)

    # TensorFlow is optional; seed it only when it is installed.
    try:
        import tensorflow as tf
    except ImportError:
        return
    tf.random.set_seed(seed)


set_seed(1)

# Sanitizing the Data

In [None]:
# Load the raw dataset. Malformed rows are skipped with a warning instead of
# aborting the load. `on_bad_lines='warn'` replaces `error_bad_lines=False`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
news_dataset = pd.read_csv('news.csv', on_bad_lines='warn')


In [None]:
# Column names used by the cleaning and preparation cells.
# NOTE(review): `columns_current`, `features` and `analysis_text` are not read
# by any code visible in this notebook — confirm whether they are still needed.
columns_current = ['title', 'text', 'label']
# Columns dropped by `remove_unused_columns` (its default argument).
remove_columns = ['drop']
features = []
# Columns whose null entries `null_process` overwrites with the string "None".
target_column = ['label']
analysis_text = ['title', 'text']

In [None]:
def fix_label(df):
    """Replace the textual labels in ``df['label']`` with integer codes.

    "FAKE" becomes 0 and "REAL" becomes 1; any other value is left as is.
    The column is reassigned on the frame that was passed in, and that same
    frame is returned.
    """
    label_codes = {"FAKE": 0, "REAL": 1}
    encoded = df['label'].replace(label_codes)
    df['label'] = encoded
    return df


def remove_unused_columns(df, columns_current=remove_columns):
    """Return a copy of ``df`` without the columns listed in ``columns_current``.

    By default the module-level ``remove_columns`` list is used.
    """
    return df.drop(columns=columns_current)


def null_process(feature_df):
    """Overwrite null entries in the target columns with the string "None".

    The columns are taken from the module-level ``target_column`` list. The
    frame is modified in place and returned.
    """
    for column_name in target_column:
        missing_rows = feature_df[column_name].isnull()
        feature_df.loc[missing_rows, column_name] = "None"
    return feature_df


def clean_dataset(df):
    """Run the cleaning steps on ``df`` in order and return the result.

    Steps: drop unused columns, mark null labels, then convert the textual
    labels to integer codes.
    """
    cleaning_steps = (remove_unused_columns, null_process, fix_label)
    for step in cleaning_steps:
        df = step(df)
    return df

In [None]:
# Clean the raw frame, then keep only rows that have both a body and a title.
news_dataset = clean_dataset(news_dataset)

news_dataframe = news_dataset.dropna(subset=['text', 'title'])

# Prepare Data for Training

In [None]:
def prepare_data(df, test_size=0.2):
    """Build model inputs from ``df`` and split them into train/validation sets.

    Each example is the article title and body joined as ``"<title> - <text>"``.
    Rows whose label is not 0 or 1 (for example the "None" placeholder written
    by ``null_process``) are left out.

    Args:
        df: Frame with ``title``, ``text`` and ``label`` columns.
        test_size: Share of the examples held out for validation; passed
            straight to ``train_test_split``.

    Returns:
        The result of ``train_test_split(texts, labels, test_size=test_size)``,
        unpacked by the caller as
        ``train_texts, valid_texts, train_labels, valid_labels``.
    """
    texts = []
    labels = []
    # Walk the three columns in lockstep instead of doing a positional
    # `.iloc` lookup for every cell of every row.
    for title, body, label in zip(df["title"], df["text"], df["label"]):
        text = title + " - " + body
        # The combined text always contains " - ", so it can never be empty;
        # only the label needs checking.
        if label in (0, 1):
            texts.append(text)
            labels.append(label)
    return train_test_split(texts, labels, test_size=test_size)


# Split the cleaned frame into training and validation texts/labels
# (default `test_size` of 0.2).
train_texts, valid_texts, train_labels, valid_labels = prepare_data(news_dataframe)


# Tokenize both splits with padding enabled and truncation to `max_length`.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(valid_texts, truncation=True, padding=True, max_length=max_length)

# News Object

In [None]:
class NewsDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence of
    values; ``labels`` is a sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Every encoding field becomes a tensor, and the label is added under
        the ``"labels"`` key as a one-element tensor.
        """
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example["labels"] = torch.tensor([self.labels[idx]])
        return example


# Wrap the tokenized splits and their labels as torch datasets for the Trainer.
train_dataset = NewsDataset(train_encodings, train_labels)
valid_dataset = NewsDataset(valid_encodings, valid_labels)

# Metrics Computation

In [None]:
def compute_metrics(pred):
    """Return the accuracy of a prediction object as ``{'accuracy': value}``.

    ``pred.predictions`` is reduced with an argmax over its last axis and
    compared against ``pred.label_ids``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

# Trainer

In [None]:
# Training configuration: one epoch, with logging and checkpointing every
# 200 steps and step-based evaluation.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',           # where checkpoints are written
    num_train_epochs=1,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=20,
    warmup_steps=100,
    logging_dir='./logs',
    load_best_model_at_end=True,      # reload the best checkpoint when training ends
    logging_steps=200,
    save_steps=200,
    evaluation_strategy="steps"
)

# Trainer wiring together the model, the two datasets and the accuracy metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics
)

# Run training

In [None]:
# Fine-tune the model on the training split.
trainer.train()

# Evaluate on the validation split; reports the metrics from `compute_metrics`.
trainer.evaluate()

# Save Model

In [None]:
# Directory that receives both the model and the tokenizer files.
model_path = "fake-news-bert-base-uncased"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

# Make a Prediction

In [None]:
def get_prediction(text, convert_to_label=False):
    """Classify ``text`` with the notebook's model and tokenizer.

    Args:
        text: The article text to classify.
        convert_to_label: When true, return the label string ("fake" or
            "reliable") instead of the class index.

    Returns:
        The predicted class index (0 or 1), or the matching label string
        when ``convert_to_label`` is true.
    """
    id_to_label = {
        0: "fake",
        1: "reliable"
    }
    # Tokenize and move the tensors to whichever device the model is on,
    # rather than assuming CUDA, so this also works on CPU-only runtimes.
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(model.device)
    # Inference only: turn dropout off and skip gradient tracking.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    # Convert the logits to probabilities and pick the most likely class.
    probs = outputs[0].softmax(1)
    predicted = int(probs.argmax())
    if convert_to_label:
        return id_to_label[predicted]
    return predicted

In [None]:
# Sample headline and summary used as a quick manual check of the model.
real_news = """
Biden Administration Urges Justices to Hear Cases on Social Media Laws
The administration argued that the laws, enacted by Florida and Texas to prevent removal of posts amid conservative complaints about censorship by tech platforms, violated the First Amendment.
"""

# Ask for the label string rather than the class index.
get_prediction(real_news, convert_to_label=True)

'reliable'