In [7]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

### Clean the english news data set

For the English bit of news we will use an [extensive data set by the University of Victoria](https://www.uvic.ca/engineering/ece/isot/datasets/fake-news/index.php), containing over 40,000 labelled news articles.

In [8]:
df1 = pd.read_csv('Input_Data/Fake.csv')

In [9]:
# we will only need a subset of the dataset, namely content and titles

df1 = df1[['title','text']]

In [10]:
df1['label'] = 'Fake'

In [11]:
df1['title'][1]

' Drunk Bragging Trump Staffer Started Russian Collusion Investigation'

In [12]:
# check for null values

df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   23481 non-null  object
 1   text    23481 non-null  object
 2   label   23481 non-null  object
dtypes: object(3)
memory usage: 550.5+ KB


In [13]:
# rename the content column
df1 = df1.rename({'text':'content'}, axis='columns')

Repeat the process for the .csv file containing all the articles labelled as true

In [14]:
df2 = pd.read_csv('Input_Data/True.csv')

In [15]:
df2 = df2[['title','text']]

In [16]:
df2['label'] = 'Real'

In [17]:
df2 = df2.rename({'text':'content'}, axis='columns')

In [18]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   content  21417 non-null  object
 2   label    21417 non-null  object
dtypes: object(3)
memory usage: 502.1+ KB


In [19]:
eng_news_complete =  pd.concat([df1, df2], join ='inner')

In [20]:
eng_news_complete = eng_news_complete.reset_index(drop=True)

In [21]:
#eng_news_complete = eng_news_complete[['content','label']]

In [22]:
eng_news_complete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   content  44898 non-null  object
 2   label    44898 non-null  object
dtypes: object(3)
memory usage: 1.0+ MB


### Concat, check, and apply final cleaning steps on the true german news

In [23]:
# Import all the news cvs

gerreal1 = pd.read_csv("Input_Data/real_ger_news_1.csv")
gerreal2 = pd.read_csv("Input_Data/real_ger_news_2.csv")
gerreal3 = pd.read_csv("Input_Data/real_ger_news_3.csv")
gerreal4 = pd.read_csv("Input_Data/real_ger_news_4.csv")
gerreal5 = pd.read_csv("Input_Data/real_ger_news_5.csv")

In [24]:
gerreal1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 469 entries, 0 to 468
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   titles   469 non-null    object
 1   content  469 non-null    object
 2   subject  469 non-null    object
 3   label    469 non-null    bool  
dtypes: bool(1), object(3)
memory usage: 11.6+ KB


In [25]:
# concat all news in a data set
ger_real_news_complete = pd.concat([gerreal1, gerreal2, gerreal3, gerreal4, gerreal5], join ='inner')

In [26]:
ger_real_news_complete.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2494 entries, 0 to 599
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   titles   2492 non-null   object
 1   content  2475 non-null   object
 2   subject  2494 non-null   object
 3   label    2494 non-null   bool  
dtypes: bool(1), object(3)
memory usage: 80.4+ KB


In [27]:
ger_real_news_complete = ger_real_news_complete.drop_duplicates()

In [28]:
ger_real_news_complete = ger_real_news_complete.dropna()

In [29]:
ger_real_news_complete.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1865 entries, 0 to 599
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   titles   1865 non-null   object
 1   content  1865 non-null   object
 2   subject  1865 non-null   object
 3   label    1865 non-null   bool  
dtypes: bool(1), object(3)
memory usage: 60.1+ KB


### Concat, check, and apply final cleaning steps on fake German news

Repeat the process for the german fake news that have been collected

In [30]:
gerfake1 = pd.read_csv("Input_Data/fake_ger_news_1.csv")
gerfake2 = pd.read_csv("Input_Data/fake_ger_news_2.csv")
gerfake3 = pd.read_csv("Input_Data/fake_ger_news_3.csv")
gerfake4 = pd.read_csv("Input_Data/fake_ger_news_4.csv")
gerfake5 = pd.read_csv("Input_Data/fake_ger_news_5.csv")
gerfake6 = pd.read_csv("Input_Data/fake_ger_news_6.csv")
gerfake7 = pd.read_csv("Input_Data/fake_ger_news_7.csv")

In [31]:
gerfake1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220 entries, 0 to 219
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   titles   220 non-null    object
 1   content  220 non-null    object
 2   label    220 non-null    bool  
dtypes: bool(1), object(2)
memory usage: 3.8+ KB


In [32]:
gerfake2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1128 entries, 0 to 1127
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   titles   1128 non-null   object
 1   content  1128 non-null   object
 2   label    1128 non-null   bool  
dtypes: bool(1), object(2)
memory usage: 18.9+ KB


In [33]:
ger_fake_news_complete = pd.concat([gerfake1, gerfake2, gerfake3, gerfake4, gerfake5, gerfake6, gerfake7], join ='inner')

In [34]:
ger_fake_news_complete = ger_fake_news_complete.drop_duplicates()

In [35]:
ger_fake_news_complete.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1652 entries, 0 to 83
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   titles   1652 non-null   object
 1   content  1652 non-null   object
 2   label    1652 non-null   bool  
dtypes: bool(1), object(2)
memory usage: 40.3+ KB


In [36]:
ger_news_complete = pd.concat([ger_real_news_complete, ger_fake_news_complete], join ='inner')

In [37]:
ger_news_complete = ger_news_complete.dropna()

In [38]:
ger_news_complete.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3517 entries, 0 to 83
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   titles   3517 non-null   object
 1   content  3517 non-null   object
 2   label    3517 non-null   bool  
dtypes: bool(1), object(2)
memory usage: 85.9+ KB


In [39]:
label_converter = {True:'Real', False:'Fake'}

ger_news_complete = ger_news_complete.replace({'label' : label_converter})

In [40]:
ger_news_complete['label'].value_counts()

Real    1865
Fake    1652
Name: label, dtype: int64

### Compile a test data set

This to have an indendent set of data to test the model a second time. For this we'll just use a kaggle data set.

In [41]:
df = pd.read_csv("Input_Data/english_data.csv")

In [42]:
df.head()

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


In [43]:
df = df.rename(str.lower, axis='columns')

In [44]:
df = df.rename({'headline':'title','body':'content'}, axis='columns')

In [45]:
label_translation = {1 : 'Real', 0:'Fake'}
df = df.replace({"label": label_translation})

In [46]:
df = df.drop(columns = ['urls'])

In [47]:
df = df.dropna()

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3988 entries, 0 to 4008
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    3988 non-null   object
 1   content  3988 non-null   object
 2   label    3988 non-null   object
dtypes: object(3)
memory usage: 124.6+ KB


In [49]:
df['label'].value_counts()

Fake    2120
Real    1868
Name: label, dtype: int64

In [50]:
eng_test = df

In [51]:
!ls

Basic EDA & stop word cleaning.ipynb eng_news_clean.csv
[34mInput_Data[m[m                           ger_news_clean.csv
complete_news_clean.csv              test_bilingual.csv


In [53]:
gert1 = pd.read_csv('Input_Data/luzz.csv')
gert2 = pd.read_csv('Input_Data/orf.csv')
gert3 = pd.read_csv('Input_Data/gertest.csv')

In [54]:
gert3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 0 to 118
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   titles   119 non-null    object
 1   content  119 non-null    object
 2   subject  119 non-null    object
 3   label    119 non-null    bool  
dtypes: bool(1), object(3)
memory usage: 3.0+ KB


In [55]:
ger_test_complete =  pd.concat([gert1, gert2, gert3], join ='inner')

In [56]:
label_translation = {True : 'Real', False:'Fake'}
ger_test_complete = ger_test_complete.replace({"label": label_translation})

In [57]:
ger_test_complete = ger_test_complete.drop(columns = ['subject'])

In [58]:
ger_test_complete = ger_test_complete.rename({'titles':'title'}, axis='columns')

In [59]:
ger_test_complete.head()

Unnamed: 0,title,content,label
0,Kleine Hufeisennase vor der Rückkehr in den Ka...,Seit 1988 konnte von der Kleinen Hufeisennase ...,Real
1,Zerstörter Salat und verfaulte Kirschen: Lage ...,Die ausserordentlichen Regenfälle führen zu ei...,Real
2,"Stefan Küng verpasst um 0,40 Sekunden Bronze u...",Der Medaillentraum von Stefan Küng platzt im Z...,Real
3,Boom der fahrenden Hotelzimmer: Luzerner Wohnm...,"Flugreisen sind noch immer unsicher, Quarantän...",Real
4,Nächster Knall! Alex Wilson verpasst Olympisch...,Nach Kariem Hussein verpasst mit Alex Wilson e...,Real


In [60]:
test = pd.concat([ger_test_complete, eng_test], join ='inner')

In [61]:
test['label'].value_counts()

Fake    2120
Real    2110
Name: label, dtype: int64

### Remove stop words

Since the tfidf vectorizer, which we'll use later, only has English stopwords, we will clean the articles using nltk before feeding the data to the model.

In [62]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [63]:
german_stop_words = stopwords.words('german')

In [64]:
english_stop_words = stopwords.words('english')

In [65]:
ger_news_complete['content'] = ger_news_complete['content'].str.split(' ').apply(lambda x: ' '.join(
    [word for word in x if word.lower() not in german_stop_words]))

In [66]:
ger_news_complete['titles'] = ger_news_complete['titles'].str.split(' ').apply(lambda x: ' '.join(
    [word for word in x if word.lower() not in german_stop_words]))

In [67]:
ger_news_complete.head()

Unnamed: 0,titles,content,label
0,Corona-Pandemie: Christine Lambrecht (SPD) spr...,"Bundesinnenminister Horst Seehofer (CSU), Bund...",Real
1,"Unwetter Sachsen, Bayern Berlin: 67-Jähriger s...",Überflutete Straße bayerischen Penzberg Starkr...,Real
2,Olympia 2021: Triathlon Männer beginnt kuriose...,Panne Olympia: Teil Triathleten wurde Boot bei...,Real
3,Galeria Karstadt Kaufhof kündigt Neustart einh...,Filiale Galeria Kaufhof Köln Reihenweise Filia...,Real
4,Olympia heute – Tag drei: Nacht geschah Tag br...,Sideris Tasiadis peilt Medaille Fehlstart beim...,Real


In [68]:
eng_news_complete['content'] = eng_news_complete['content'].str.split(' ').apply(lambda content:
    ' '.join([word for word in content if word not in english_stop_words]))

In [69]:
eng_news_complete['title'] = eng_news_complete['title'].str.split(' ').apply(lambda content:
    ' '.join([word for word in content if word not in english_stop_words]))

In [70]:
eng_news_complete.head()

Unnamed: 0,title,content,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump wish Americans Happy New Year lea...,Fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,Fake
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, revealed former Milwaukee Sheriff D...",Fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced would...",Fake
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used annual Christmas Day message...,Fake


In [71]:
eng_news_complete['content'] = eng_news_complete['content'].replace(r'\n',' ', regex=True)

In [72]:
eng_news_complete['title'] = eng_news_complete['title'].replace(r'\n',' ', regex=True)

In [73]:
eng_news_complete.head()

Unnamed: 0,title,content,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump wish Americans Happy New Year lea...,Fake
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,Fake
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, revealed former Milwaukee Sheriff D...",Fake
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced would...",Fake
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used annual Christmas Day message...,Fake


In [74]:
complete_news_clean = pd.concat([eng_news_complete, ger_news_complete], join ='inner')

In [75]:
complete_news_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48415 entries, 0 to 83
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  48415 non-null  object
 1   label    48415 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


In [76]:
test['content'] = test['content'].str.split(' ').apply(lambda content:
    ' '.join([word for word in content if word not in english_stop_words]))

In [77]:
test['title'] = test['title'].str.split(' ').apply(lambda content:
    ' '.join([word for word in content if word not in english_stop_words]))

In [78]:
test['content'] = test['content'].str.split(' ').apply(lambda content:
    ' '.join([word for word in content if word not in german_stop_words]))

In [79]:
test['title'] = test['title'].str.split(' ').apply(lambda content:
    ' '.join([word for word in content if word not in german_stop_words]))

In [80]:
test.to_csv('test_bilingual.csv', index = False)
eng_news_complete.to_csv('eng_news_clean.csv', index = False)
ger_news_complete.to_csv('ger_news_clean.csv', index = False)
complete_news_clean.to_csv('complete_news_clean.csv', index = False)