<a href="https://colab.research.google.com/github/PeteCrowley/MachineLearning/blob/main/FakeNewsDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas
import numpy as np
from tensorflow.keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import tensorflow as tf


# Dataset from here: https://www.kaggle.com/c/fake-news/data?select=train.csv
df = pandas.read_csv("train.csv")
# Drop every row that has a missing value in any column.
clean_df = df.dropna()

# Only keep articles whose id is below this cutoff because my RAM can't handle more data
MAX_ARTICLE_ID = 3000

# Build the mask from clean_df itself. A mask computed on the un-cleaned `df` has a
# different index than clean_df, so pandas has to realign it behind the scenes and
# emits "Boolean Series key will be reindexed to match DataFrame index".
subset_df = clean_df.loc[clean_df['id'] < MAX_ARTICLE_ID]
X = np.array(subset_df['text'])
Y = np.array(subset_df['label'])

# Splitting into train and test data (80% / 20%, fixed seed so the split is repeatable)
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.2, random_state=7)




In [None]:
# Turn the raw article strings into numerical TF-IDF feature vectors.
vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# The vectorizer is fitted on the training articles only; the test articles are
# then projected with that same fitted vectorizer.
tfidf_train, tfidf_test = vectorizer.fit_transform(train_X), vectorizer.transform(test_X)



In [None]:
# A more lightweight model that runs quicker and is less RAM intensive.
# random_state is pinned (same seed as the train/test split) so that the reported
# accuracy is the same on every Restart & Run All instead of drifting between runs.
model = PassiveAggressiveClassifier(max_iter=50, random_state=7)
model.fit(tfidf_train, train_Y)

# Score the fitted classifier on the held-out test vectors.
y_pred = model.predict(tfidf_test)
score = accuracy_score(test_Y, y_pred)
print(f'Accuracy: {round(score*100,2)}%')



Accuracy: 96.64%


In [None]:
# Classify a single pasted article with the lightweight linear model.
# NOTE: `model` is rebound to the neural network in a later cell, so run this cell
# before that one if you want the PassiveAggressiveClassifier's verdict.
text = [input("Enter Article: ")]
tfidf_text = vectorizer.transform(text)
choice = model.predict(tfidf_text)
# A predicted label of 0 is reported as REAL; any other label is reported as FAKE.
print("REAL" if choice == 0 else "FAKE")

Enter Article: LONDON (AP) — The World Health Organization said there was a 7% rise in new coronavirus cases across Europe last week, the only region in the world where cases increased, and said uneven vaccine uptake posed a threat to the continent.  In its weekly assessment of the pandemic, the U.N. health agency said there were about 2.7 million new COVID-19 cases and more than 46,000 deaths last week worldwide, similar to the numbers reported the previous week.  WHO said the two regions with the highest rates of COVID-19 incidence were Europe and the Americas. Globally, the U.S. reported the biggest number of new cases, more than 580,000, which still represented a 11% decline.  Britain, Russia and Turkey accounted for the most cases in Europe.  ADVERTISEMENT  The biggest drop in COVID-19 cases were seen in Africa and the Western Pacific, where infections fell by about 18% and 16%, respectively. The number of deaths in Africa also declined by about a quarter, despite the dire shortag

In [None]:
# Converts the sparse TF-IDF matrices into dense arrays which can be fed into the
# neural network. `.toarray()` yields a plain numpy.ndarray, whereas `.todense()`
# yields the legacy numpy.matrix type, which is best kept away from Keras.
tfidf_train_new = tfidf_train.toarray()
tfidf_test_new = tfidf_test.toarray()


# Setting Constants
EPOCHS = 10
NUM_CLASSES = 1  # a single sigmoid output unit
INPUT_SHAPE = (tfidf_train.shape[1], )  # one input feature per TF-IDF vocabulary term

# Seed TensorFlow's global random generator so that training is more repeatable
# between runs (same seed value as the train/test split).
tf.random.set_seed(7)


# Creating network with specific layers
model = Sequential()

model.add(Flatten(input_shape=INPUT_SHAPE))
model.add(Dropout(rate=0.2))
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(rate=0.2))
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=NUM_CLASSES, activation='sigmoid'))

# Compiling and fitting the neural network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(tfidf_train_new, train_Y, epochs=EPOCHS, shuffle=True)

# Printing Info about the network's accuracy
# evaluate() returns [loss, accuracy] here because 'accuracy' is the only compiled metric.
scores = model.evaluate(tfidf_test_new, test_Y, verbose=0)
print('Test accuracy:', scores[1])
# NOTE(review): the reload cell reads this exact path, so keep the two in sync.
# Newer Keras releases may insist on a `.keras`/`.h5` extension here - confirm
# against the installed version before changing it.
model.save("Fake_News_Model")

In [None]:
from keras.models import load_model

# Restore the network written by model.save("Fake_News_Model") in the training cell,
# then compile it again with the same loss / optimizer / metric used for training.
model = load_model('Fake_News_Model')
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Prediction for a given article using the neural network
text = [input("Enter Article: ")]

# Vectorize with the already-fitted TF-IDF vectorizer. `.toarray()` hands Keras a
# plain numpy.ndarray rather than the legacy numpy.matrix that `.todense()` returns.
# predict() yields one row per input article; [0] selects the row for our only article.
truth_score = model.predict(vectorizer.transform(text).toarray())[0]

# Despite the variable name above, the sigmoid output is treated as the probability
# of the FAKE class (values above 0.5 are reported as FAKE below). Pull it out as a
# plain Python float so that the comparison and the printed percentage do not depend
# on array truthiness or on NumPy scalar arithmetic.
fake_probability = float(truth_score[0])

if fake_probability > 0.5:
  print(f'There is a {fake_probability*100}% chance this is FAKE news!')
else:
  print(f'There is a {100 - fake_probability*100}% chance this is REAL NEWS!')

Enter Article: hello
There is a 90.25720357894897% chance this is FAKE news!
