In [11]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

# Import the tfidf vectorizer to quantify the significance of words within and across documents
from sklearn.feature_extraction.text import TfidfVectorizer

# Import the PassiveAggressiveClassifier as the model to classify news later
from sklearn.linear_model import PassiveAggressiveClassifier

# We will also import cross_val_score to validate the model by cross-checking beyond the confusion matrix
from sklearn.model_selection import cross_val_score

In [4]:
# ls for file-name references
!ls

First Iteration Modelling.ipynb eng_news_clean.csv
complete_news_clean.csv         ger_news_clean.csv


### Model with English articles only

In [5]:
english_news = pd.read_csv('eng_news_clean.csv')

In [6]:
english_news.head()

Unnamed: 0,title,content,label
0,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images On Sunday morning...,1
1,Linklater's war veteran comedy speaks modern A...,"LONDON (Reuters) - “Last Flag Flying”, comedy-...",1
2,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke public view last week Mr. Corke...,1
3,Egypt's Cheiron wins tie-up Pemex Mexican onsh...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,Jason Aldean opens 'SNL' Vegas tribute,"Country singer Jason Aldean, performing Las Ve...",1


In [10]:
label_converter = {1:'Real', 0:'Fake'}

english_news = english_news.replace({'label' : label_converter})

In [37]:
X = english_news['content']

In [38]:
X.shape

(3988,)

In [26]:
y = english_news['label']

In [27]:
y.shape

(3988,)

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 40)

In [106]:
# We are not selecting stop words, since those have been cleaned out already

tfidf = TfidfVectorizer()

In [40]:
# Fit the data into the model to learn the vocabulary

tfidf_train =tfidf.fit_transform(X_train) 
tfidf_test =tfidf.transform(X_test)

In [41]:
pac = PassiveAggressiveClassifier(max_iter = 50)

pac.fit(tfidf_train,y_train)

PassiveAggressiveClassifier(max_iter=50)

In [42]:
y_pred=pac.predict(tfidf_test)

score=accuracy_score(y_test,y_pred)

print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 99.37%


In [44]:
confusion_matrix(y_test,y_pred, labels=['Fake','Real'])

array([[419,   3],
       [  2, 374]])

### Model with German articles only

In [95]:
german_news = pd.read_csv('ger_news_clean.csv')

In [107]:
german_news.head()

Unnamed: 0,titles,content,label
0,Corona-Pandemie: Christine Lambrecht (SPD) spr...,"Bundesinnenminister Horst Seehofer (CSU), Bund...",Real
1,"Unwetter Sachsen, Bayern Berlin: 67-Jähriger s...",Überflutete Straße bayerischen Penzberg Starkr...,Real
2,Olympia 2021: Triathlon Männer beginnt kuriose...,Panne Olympia: Teil Triathleten wurde Boot bei...,Real
3,Galeria Karstadt Kaufhof kündigt Neustart einh...,Filiale Galeria Kaufhof Köln Reihenweise Filia...,Real
4,Olympia heute – Tag drei: Nacht geschah Tag br...,Sideris Tasiadis peilt Medaille Fehlstart beim...,Real


In [108]:
label_converter = {True:'Real', False:'Fake'}

In [109]:
german_news = german_news.replace({'label' : label_converter})

In [110]:
german_news.head()

Unnamed: 0,titles,content,label
0,Corona-Pandemie: Christine Lambrecht (SPD) spr...,"Bundesinnenminister Horst Seehofer (CSU), Bund...",Real
1,"Unwetter Sachsen, Bayern Berlin: 67-Jähriger s...",Überflutete Straße bayerischen Penzberg Starkr...,Real
2,Olympia 2021: Triathlon Männer beginnt kuriose...,Panne Olympia: Teil Triathleten wurde Boot bei...,Real
3,Galeria Karstadt Kaufhof kündigt Neustart einh...,Filiale Galeria Kaufhof Köln Reihenweise Filia...,Real
4,Olympia heute – Tag drei: Nacht geschah Tag br...,Sideris Tasiadis peilt Medaille Fehlstart beim...,Real


In [111]:
X = german_news['content']

In [112]:
y = german_news['label']

In [113]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 40)

In [114]:
tfidf_train = tfidf.fit_transform(X_train) 
tfidf_test = tfidf.transform(X_test)

In [115]:
pac = PassiveAggressiveClassifier(max_iter = 50)

pac.fit(tfidf_train,y_train)

PassiveAggressiveClassifier(max_iter=50)

In [116]:
y_pred = pac.predict(tfidf_test)

score = accuracy_score(y_test,y_pred)

print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 99.02%


In [117]:
confusion_matrix(y_test,y_pred, labels=['Fake','Real'])

array([[352,   2],
       [  4, 256]])