In [1]:
#Fake News Detection Using Machine Learning

#Fake News - A type of yellow journalism, fake news encapsulates pieces of news that may be hoaxes and is generally spread through social media and other online media. This is often done to further or impose certain ideas and is often achieved with political agendas. Such news items may contain false and/or exaggerated claims, and may end up being viralized by algorithms, and users may end up in a filter bubble.

import numpy as np
import pandas as pd
import itertools

from sklearn.model_selection import train_test_split
#train_test_split - split arrays or matrices into random train and test subsets

from sklearn.feature_extraction.text import TfidfVectorizer
#TF (Term Frequency): The number of times a word appears in a document is its Term Frequency

#IDF (Inverse Document Frequency): Words that occur many times a document, but also occur many times in many others, may be irrelevant. IDF is a measure of how significant a term is in the entire corpus.

#TfidfVectorizer converts a collection of raw documents into a matrix of TF-IDF features

from sklearn.linear_model import PassiveAggressiveClassifier
#Passive Aggressive algorithms are online learning algorithms. Such an algorithm remains passive for a correct classification outcome, and turns aggressive in the event of a miscalculation, updating and adjusting. Unlike most other algorithms, it does not converge. Its purpose is to make updates that correct the loss, causing very little change in the norm of the weight vector.

from sklearn.metrics import accuracy_score, confusion_matrix

#reading the data
df = pd.read_csv('news.csv')

#getting shape and head
df.shape
df.head()

#getting the labels
labels=df.label
labels.head()

#Train/Test is a method to measure the accuracy of the model. It is called Train/Test because you split the the data set into two sets: a TRAINING set and a TESTING set. 80% for training, and 20% for testing. You train the model using the training set. You test the model using the testing set.

#Split the dataset
x_train, x_test, y_train, y_test = train_test_split(df['text'], labels, test_size=0.2, random_state=7)


#Stop words are the English words which does not add much meaning to a sentence. They can safely be ignored without sacrificing the meaning of the sentence. For example, the words like the, he, have etc. Such words are already captured this in corpus named corpus.

#intializing a TfidVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

#fit and transform train set, transform test set
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

#initializing a passiveAggressiveClassifier
pac = PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train, y_train)

#Predicting on the test set and calculating accuracy
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

#A true positive is an outcome where the model correctly predicts the positive class. Similarly, a true negative is an outcome where the model correctly predicts the negative class. A false positive is an outcome where the model incorrectly predicts the positive class.

confusion_matrix(y_test, y_pred, labels=['FAKE', 'REAL'])

Accuracy: 92.9%


array([[592,  46],
       [ 44, 585]], dtype=int64)