In [197]:
# Import needed libraries

import pandas as pd
from tqdm.notebook import tqdm
import requests
from bs4 import BeautifulSoup
from newscatcher import Newscatcher
import requests
import re

### Get cornerstone data on german news

We will use the newscatcher python wrapper, which uses the Newscatcher API to get URLs and Titles of serious news sources.

In [198]:
# Check supported german news sites

In [384]:
from newscatcher import urls

german_urls = urls(country = 'de', language = 'de')
german_urls

['spiegel.de',
 'welt.de',
 'faz.net',
 'bild.de',
 'focus.de',
 'handelsblatt.com',
 'zdf.de',
 'stern.de',
 'rp-online.de',
 'taz.de',
 'wiwo.de',
 'mdr.de',
 'stuttgarter-zeitung.de',
 'kicker.de',
 'kn-online.de',
 'wz.de']

In [200]:
# We will remove BILD (tabloid) and kicker (football newspaper)

In [201]:
remove = ['bild.de', 'kicker.de', 'ghacks.net']

In [350]:
german_urls_clean = [url for url in german_urls if url not in remove]

In [204]:
# We will add tagesschau.de as for some reason it is missing, but a supported site

german_urls_clean.extend(['tagesschau.de'])

In [351]:
# Get all the relevant corner data from newspapers
all_news = []

for source in tqdm(german_urls_clean):
    try:
        result = Newscatcher(website = source).get_news()
        articles = result['articles']
        all_news.extend(articles)
    except TypeError:
        continue

  0%|          | 0/6 [00:00<?, ?it/s]

Website is not supported


In [352]:
len(all_news)

119

Initially the idea was to also fetch subjects, but unfortunately it is something that not all news sources denote and something that would be extremely hard to reproduce for the fake news sources. For the purpose of simplicity we will keep it (there might be use for it in the future), but will not include it in the analysis.

In [354]:
subjects = []

for page in tqdm(range(len(all_news))):
    try:
        subject = all_news[page]['tags'][0]['term']
        subjects.append(subject)
    except:
        subject = 'no subject stated'
        subjects.append(subject)

  0%|          | 0/119 [00:00<?, ?it/s]

In [355]:
len(subjects)

119

In [356]:
all_news[0]['title']

'"Hol die Kameltreiber": Rassismus-Eklat um deutschen Olympia-Trainer'

In [357]:
len(titles)

73

In [358]:
all_news[0]['link']

'https://kurier.at/sport/olympia-ticker-alles-wichtige-zu-den-olympischen-sommerspielen-2020-in-tokio/401455036'

### Get all the pages that I want to scrape for content

We will simply fetch all the URLs for all articles we were able to fetch from the Newscatcher.

In [359]:
pages = []

for page in tqdm(range(len(all_news))):
    link = all_news[page]['link']
    pages.append(link)

  0%|          | 0/119 [00:00<?, ?it/s]

In [360]:
len(pages)

119

In [361]:
pages[0]

'https://kurier.at/sport/olympia-ticker-alles-wichtige-zu-den-olympischen-sommerspielen-2020-in-tokio/401455036'

### Get news content from all the pages

Now with all the article URLS we will use requests to get the pages' content and do so with a little sleep time in between to scrape a bit more respectfully. ;)

We'll then parse the content with BeautifulSoup under the assumption that we can get high_enough quality content by selecting all the paragragph elements. It might be the case that we will scrape some irrelevant paragraphs with it (i.e. text that is not part for the core artcile), but should have high enough quality overall as well as a standardized approach across all news sites.

In [362]:
from time import sleep
from random import randint

In [None]:
page_content = []

for i in tqdm(range(len(pages))):
    url = str(pages[i])
    response = requests.get(url)
    page_content.append(response)
    wait_time = randint(1,3)
    sleep(wait_time)

In [364]:
page_content[:5]

[<Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>]

In [365]:
paragraphs = []

for page in tqdm(range(len(page_content))):
    paragraph = BeautifulSoup(page_content[page].content, "html.parser")
    paragraphs.append(paragraph)

  0%|          | 0/119 [00:00<?, ?it/s]

In [367]:
p_tags = []

for paragraph in tqdm(range(len(paragraphs))):
    tags = paragraphs[paragraph].find_all('p')
    p_tags.append(tags)

  0%|          | 0/119 [00:00<?, ?it/s]

In [None]:
for p in p_tags[0]:
    print(p.get_text())

In [None]:
[i for i in p_tags]

In [370]:
final_content = ["\n".join(p.get_text() for p in i) for i in p_tags]

In [371]:
len(final_content)

119

In [372]:
titles = []

for page in tqdm(range(len(pages))):
    title = all_news[page]['title']
    titles.append(title)

  0%|          | 0/119 [00:00<?, ?it/s]

### Create the data frame

We will now create a data frame for real news sources. I have repeated this process 4 days in a row to get enough input to train our model on german vocabulary later as well.

In [374]:
real_ger_news = pd.DataFrame(titles, columns=['titles'])

In [375]:
real_ger_news.apply(lambda col: col.drop_duplicates().reset_index(drop=True))

Unnamed: 0,titles
0,"""Hol die Kameltreiber"": Rassismus-Eklat um deu..."
1,Nö
2,Situation am Arbeitsmarkt entspannte sich im e...
3,Wie viel der Grüne Pass gekostet hat
4,"Boris Johnsons Tochter zeigt, wie sexy Shapewe..."
...,...
114,"Drei Schuld- und zwei Freisprüche im ""Terrorpr..."
115,Frankreich: Justiz ermittelt gegen Ex-Minister...
116,Dortmund holt Niederlande-Stürmer Malen
117,Portugal bittet EU-Staaten um Corona-Impfdosen


In [376]:
real_ger_news.head()

Unnamed: 0,titles
0,"""Hol die Kameltreiber"": Rassismus-Eklat um deu..."
1,Nö
2,Situation am Arbeitsmarkt entspannte sich im e...
3,Wie viel der Grüne Pass gekostet hat
4,"Boris Johnsons Tochter zeigt, wie sexy Shapewe..."


In [377]:
real_ger_news['content'] = final_content

In [378]:
real_ger_news['subject'] = subject

In [379]:
real_ger_news['label'] = 'True'

In [380]:
real_ger_news.head()

Unnamed: 0,titles,content,subject,label
0,"""Hol die Kameltreiber"": Rassismus-Eklat um deu...",info\n© APA/AFP/POOL/TIM DE WAELE / TIM DE WAE...,no subject stated,True
1,Nö,Im „Kulturmontag spezial“ aus Salzburg wurden ...,no subject stated,True
2,Situation am Arbeitsmarkt entspannte sich im e...,info\n© stokkete - stock.adobe.com / stokkete/...,no subject stated,True
3,Wie viel der Grüne Pass gekostet hat,info\n© APA/HELMUT FOHRINGER\nGesundheitsminis...,no subject stated,True
4,"Boris Johnsons Tochter zeigt, wie sexy Shapewe...",info\nPrime Minister Boris Johnson ist offizie...,no subject stated,True


In [381]:
real_ger_news['content'] = real_ger_news['content'].replace(r'\n',' ', regex=True)

In [382]:
real_ger_news.head(5)

Unnamed: 0,titles,content,subject,label
0,"""Hol die Kameltreiber"": Rassismus-Eklat um deu...",info © APA/AFP/POOL/TIM DE WAELE / TIM DE WAEL...,no subject stated,True
1,Nö,Im „Kulturmontag spezial“ aus Salzburg wurden ...,no subject stated,True
2,Situation am Arbeitsmarkt entspannte sich im e...,info © stokkete - stock.adobe.com / stokkete/s...,no subject stated,True
3,Wie viel der Grüne Pass gekostet hat,info © APA/HELMUT FOHRINGER Gesundheitsministe...,no subject stated,True
4,"Boris Johnsons Tochter zeigt, wie sexy Shapewe...",info Prime Minister Boris Johnson ist offiziel...,no subject stated,True


In [383]:
real_ger_news.to_csv('gertest.csv', index = False)

In [344]:
!ls

German News Scrapping.ipynb luzz.csv
GermanFakeNC.json           orf.csv
fake_ger_news_1.csv         politifact_fake.txt
fake_ger_news_2.csv         politifact_real.txt
fake_ger_news_3.csv         real_ger_news_1.csv
fake_ger_news_4.csv         real_ger_news_2.csv
fake_ger_news_5.csv         real_ger_news_3.csv
fake_ger_news_6.csv         real_ger_news_4.csv
fake_ger_news_7.csv         real_ger_news_5.csv


In [None]:
pd.read_csv('orf.csv').head()

### Get Fake News from JSON file

Initially the idea was to train the model in German by using a JSON dictionary which was used for [academic reasearch](https://www.springerprofessional.de/en/fake-news-detection-with-the-new-german-dataset-germanfakenc/17153988) on the topic of fake news detection.

However, after cleaning the data, it became clear that only 220 articles are usable, which led to additional scraping from 2 additional sites publishing misleading news.

In [280]:
import json

In [282]:
with open('GermanFakeNC.json') as json_file:
    data = json.load(json_file)

In [298]:
data[2].keys()

dict_keys(['Date', 'URL', 'False_Statement_1_Location', 'False_Statement_1_Index', 'False_Statement_2_Location', 'False_Statement_2_Index', 'False_Statement_3_Location', 'False_Statement_3_Index', 'Ratio_of_Fake_Statements', 'Overall_Rating'])

In [294]:
data[2]['URL']

'http://blauerbote.com/2017/06/02/angela-merkel-laesst-rock-am-ring-abbrechen/'

In [295]:
fake_urls = []

for entry in tqdm(range(len(data))):
    fake_urls.append(data[entry]['URL'])

  0%|          | 0/490 [00:00<?, ?it/s]

In [296]:
fake_urls

['https://schluesselkindblog.com/2017/08/30/prozess-beginnt-mord-an-heidelberger-studentin/',
 'http://blauerbote.com/2017/12/18/bild-journalist-julian-roepcke-und-seine-nazifreunde/',
 'http://blauerbote.com/2017/06/02/angela-merkel-laesst-rock-am-ring-abbrechen/',
 'http://smopo.ch/deutschlands-neonazis-waehlen-den-untergang/',
 'http://www.truth24.net/gruppenvergewaltigung-sex-jihadisten-vergewaltigen-junge-deutsche-auf-heimweg/',
 'http://www.rapefugees.net/ozapft-is-tuerke-versucht-koreanerin-auf-die-wiesn-zu-vergewaltigen-oktoberfest-startet-dreckig-muenchen/',
 'https://blog.halle-leaks.de/dresden-wieder-kind-durch-fluechtling-brutal-vergewaltigt-wie-lange-noch/',
 'https://www.unzensuriert.at/content/0026189-Katalonien-100-Tage-Diktatur-des-spanischen-Zentralstaats',
 'http://www.guidograndt.de/2018/01/22/kollegenbeitrag-waehler-verraeter-spd/',
 'https://de.sott.net/article/31971-Umfrage-92-der-Ukrainer-wollen-russischen-Fuhrer-zuruck-Schnauze-voll-von-Poroschenko',
 'http://i

In [301]:
# Get all fake news
fake_page_content = []

for i in tqdm(range(len(fake_urls))):
    try:
        url = str(fake_urls[i])
        response = requests.get(url)
        fake_page_content.append(response)
        wait_time = randint(1,3)
    except:
        continue

  0%|          | 0/490 [00:00<?, ?it/s]

In [304]:
fake_page_content[5:]

[<Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [404]>,
 <Response [403]>,
 <Response [200]>,
 <Response [200]>,
 <Response [404]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [404]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [404]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [404]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [404]>,
 <Response [404]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [406]>,
 <Response [404]>,
 <Response [200]>,
 <Response [200]>,
 <Response [404]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [404]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [404]>,
 <Response [200]>,
 <Response [200]>,
 <Response [

In [308]:
fake_page_content = [page for page in fake_page_content if page.status_code == 200]

In [309]:
len(fake_page_content)

328

In [310]:
fake_content = []

for page in tqdm(range(len(fake_page_content))):
    content = BeautifulSoup(fake_page_content[page].content, "html.parser")
    fake_content.append(content)

  0%|          | 0/328 [00:00<?, ?it/s]

In [390]:
fake_p_tags = []

for paragraph in tqdm(range(len(fake_content))):
    tags = fake_content[paragraph].find_all('p')
    fake_p_tags.append(tags)

  0%|          | 0/328 [00:00<?, ?it/s]

In [None]:
fake_p_tags

In [393]:
final_fake_content = ["\n".join(p.get_text() for p in i) for i in fake_p_tags]

In [395]:
len(final_fake_content)

328

In [411]:
fake_titles = []

for title in tqdm(range(len(fake_content))):
    tags = fake_content[title].find_all('h1')
    fake_titles.append(tags)

  0%|          | 0/328 [00:00<?, ?it/s]

In [412]:
fake_titles

[[],
 [<h1 class="entry-title">BILD-Journalist Julian Röpcke und seine Nazifreunde</h1>],
 [<h1 class="entry-title">Angela Merkel läßt Rock am Ring abbrechen</h1>],
 [<h1 class="post-title entry-title">
  			Gruppenvergewaltigung: Sex Jihadisten vergewaltigen junge Deutsche auf Heimweg			        </h1>,
  <h1>Bad Soden / Hofheim. Beim Herumvagabundieren überfielen die beiden Araber einfach eine junge Frau, sie wollte nur nach Hause</h1>],
 [<h1 class="entry-title">Ozapft Is! Türke versucht Koreanerin auf die Wiesn zu vergewaltigen – Oktoberfest startet dreckig | München</h1>,
  <h1>Bereits der erste Tag startet wieder mit einer versuchten Vergewaltigung durch einen Moslem, die Tagesschau berichtet es sei alles friedlich gestartet</h1>],
 [],
 [],
 [<h1 class="site-title"> <a href="http://www.guidograndt.de/" rel="home">GUIDO GRANDT </a></h1>,
  <h1 class="title single"> <a title="Permalink zu: KOLLEGENBEITRAG: “Wähler-Verräter SPD?!”">
                    KOLLEGENBEITRAG: “Wähler-Verrät

In [423]:
final_fake_titles = ["\n".join(p.get_text() for p in i) for i in fake_titles]

In [431]:
final_fake_titles[325]

''

In [415]:
len(final_fake_titles)

328

In [424]:
fake_ger_news = pd.DataFrame(final_fake_titles, columns=['titles'])

In [425]:
fake_ger_news['content'] = final_fake_content

In [428]:
fake_ger_news['label'] = 'False'

Since these articles are a bit messier than those from more realiable news sources, we'll apply some rudimentary cleaning operations to them via regex on top.

In [436]:
fake_ger_news['content'] = fake_ger_news['content'].replace(r'\n',' ', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fake_ger_news['content'] = fake_ger_news['content'].replace(r'\n',' ', regex=True)


In [437]:
fake_ger_news['content'] = fake_ger_news['content'].replace(r'\t',' ', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fake_ger_news['content'] = fake_ger_news['content'].replace(r'\t',' ', regex=True)


In [441]:
fake_ger_news['titles'] = fake_ger_news['titles'].replace(r'\n',' ', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fake_ger_news['titles'] = fake_ger_news['titles'].replace(r'\n',' ', regex=True)


In [442]:
fake_ger_news['titles'] = fake_ger_news['titles'].replace(r'\t',' ', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fake_ger_news['titles'] = fake_ger_news['titles'].replace(r'\t',' ', regex=True)


In [443]:
fake_ger_news['titles'] = fake_ger_news['titles'].replace(r'\r',' ', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fake_ger_news['titles'] = fake_ger_news['titles'].replace(r'\r',' ', regex=True)


In [444]:
fake_ger_news

Unnamed: 0,titles,content,label
1,BILD-Journalist Julian Röpcke und seine Nazifr...,Blauer Bote Magazin – Wissenschaft statt Propa...,False
2,Angela Merkel läßt Rock am Ring abbrechen,Blauer Bote Magazin – Wissenschaft statt Propa...,False
3,Gruppenvergewaltigung: Sex Jihadisten ver...,Suchen « Er wollte sie im Fluss ertränken: ...,False
4,Ozapft Is! Türke versucht Koreanerin auf die W...,Rapefugees.net Vergewaltigungen melden ungesc...,False
7,GUIDO GRANDT KOLLEGENBE...,"Publizist, Autor & investigativer Journalist S...",False
...,...,...,...
320,Mehrere Sexattacken: Armutsasylanten über...,Suchen « 572 Fälle von Genitalverstümmelung...,False
322,GUIDO GRANDT “SIE KOMME...,"Publizist, Autor & investigativer Journalist ...",False
323,GUIDO GRANDT AMTLICH: D...,"Publizist, Autor & investigativer Journalist ...",False
324,Sex-Dschihad in Essen: 200 „Flüchtlinge“ stürm...,Merkels “Fachkräfte” haben erneut zugeschlagen...,False


In [445]:
fake_ger_news.to_csv('fake_ger_news_1.csv', index = False)

### Scrape individual fake news websites to increase sample size

Since the fake news websites in the JSON all have vastly different structures coming up with a standardized approach was not a real option.

So, the final approach was to scrape data from 2 sites (one German and one Austrian) that have shown a more professional setup and a structure, which is straightforward to scrape.

In [57]:
# We will be able to repeat this for every month, since the website has an acceptable structure

url = 'http://blauerbote.com/2021/07/'

In [58]:
response=requests.get(url)
response.status_code

200

In [59]:
bbote = BeautifulSoup(response.content, "html.parser")

In [63]:
print(bbote.prettify())

<!DOCTYPE html>
<html lang="de-DE">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <link href="http://gmpg.org/xfn/11" rel="profile"/>
  <link href="http://blauerbote.com/xmlrpc.php" rel="pingback"/>
  <title>
   Juli 2021 – Blauer Bote Magazin – Wissenschaft statt Propaganda
  </title>
  <link href="//fonts.googleapis.com" rel="dns-prefetch">
   <link href="//s.w.org" rel="dns-prefetch">
    <link href="http://blauerbote.com/feed/" rel="alternate" title="Blauer Bote Magazin - Wissenschaft statt Propaganda » Feed" type="application/rss+xml"/>
    <link href="http://blauerbote.com/comments/feed/" rel="alternate" title="Blauer Bote Magazin - Wissenschaft statt Propaganda » Kommentar-Feed" type="application/rss+xml"/>
    <script type="text/javascript">
     window._wpemojiSettings = {"baseUrl":"https:\/\/s.w.org\/images\/core\/emoji\/2.3\/72x72\/","ext":".png","svgUrl":"https:\/\/s.w.org\/images\/core\/emoji\/2.3\/svg\/","svgEx

In [80]:
# create the url string for the page search
# we don't really need a complex iterations variable, since the url-structure is very simple

iterations = range(2,6,1)

july = ['http://blauerbote.com/2021/07']

for i in iterations:
        start_at =str(i)
        url='http://blauerbote.com/2021/07/page/' + start_at + '/'
        july.append(url)

In [81]:
july

['http://blauerbote.com/2021/07',
 'http://blauerbote.com/2021/07/page/2/',
 'http://blauerbote.com/2021/07/page/3/',
 'http://blauerbote.com/2021/07/page/4/',
 'http://blauerbote.com/2021/07/page/5/']

In [91]:
def get_pages(l):
    for i in tqdm(iterations):
        start_at ='/page/' + str(i) + '/'
        url= l[0] + start_at
        l.append(url)
    return l

In [88]:
june = ['http://blauerbote.com/2021/06/']
may = ['http://blauerbote.com/2021/05/']
april = ['http://blauerbote.com/2021/04/']
march = ['http://blauerbote.com/2021/03/']
february = ['http://blauerbote.com/2021/02/']
january = ['http://blauerbote.com/2021/01/']
december = ['http://blauerbote.com/2020/12/']
november = ['http://blauerbote.com/2020/11/']

In [92]:
get_pages(june)
get_pages(may)
get_pages(april)
get_pages(march)
get_pages(february)
get_pages(january)
get_pages(december)
get_pages(november)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

['http://blauerbote.com/2020/11/',
 'http://blauerbote.com/2020/11//page/2/',
 'http://blauerbote.com/2020/11//page/3/',
 'http://blauerbote.com/2020/11//page/4/',
 'http://blauerbote.com/2020/11//page/5/']

In [93]:
import itertools

In [94]:
blau_links = itertools.chain(july, june, may, april, march, february, january, december, november)

In [97]:
blau_links = list(blau_links)

In [98]:
blau_links

['http://blauerbote.com/2021/07',
 'http://blauerbote.com/2021/07/page/2/',
 'http://blauerbote.com/2021/07/page/3/',
 'http://blauerbote.com/2021/07/page/4/',
 'http://blauerbote.com/2021/07/page/5/',
 'http://blauerbote.com/2021/06/',
 'http://blauerbote.com/2021/06//page/2/',
 'http://blauerbote.com/2021/06//page/3/',
 'http://blauerbote.com/2021/06//page/4/',
 'http://blauerbote.com/2021/06//page/5/',
 'http://blauerbote.com/2021/06//page/2/',
 'http://blauerbote.com/2021/06//page/3/',
 'http://blauerbote.com/2021/06//page/4/',
 'http://blauerbote.com/2021/06//page/5/',
 'http://blauerbote.com/2021/05/',
 'http://blauerbote.com/2021/05//page/2/',
 'http://blauerbote.com/2021/05//page/3/',
 'http://blauerbote.com/2021/05//page/4/',
 'http://blauerbote.com/2021/05//page/5/',
 'http://blauerbote.com/2021/05//page/2/',
 'http://blauerbote.com/2021/05//page/3/',
 'http://blauerbote.com/2021/05//page/4/',
 'http://blauerbote.com/2021/05//page/5/',
 'http://blauerbote.com/2021/04/',
 'htt

In [99]:
pages =[]

for url in tqdm(blau_links):
    response = requests.get(url)
    print("status=" + str(response.status_code))
    pages.append(response) 
    wait_time=randint(1,3)
    sleep(wait_time)

  0%|          | 0/53 [00:00<?, ?it/s]

status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=404
status=200
status=200
status=200
status=404
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=404
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200
status=200


In [100]:
pages = [page for page in pages if page.status_code == 200]

In [184]:
def page_requests(l):
    
    pages_raw =[]

    for url in tqdm(range(len(l))):
        response = requests.get(url)
        print("status=" + str(response.status_code))
        pages.append(response) 
        wait_time= 0.1
        sleep(wait_time)

In [101]:
len(pages)

50

In [103]:
page1 = BeautifulSoup(pages[0].content,"html.parser")

In [119]:
page1.select('a.more-link')[0]['href']

'http://blauerbote.com/2021/07/26/the-recommended-genetic-manipulation-of-humans/'

In [127]:
content = []

for page in tqdm(range(len(pages))):
    contents = BeautifulSoup(pages[page].content,"html.parser")
    content.append(contents)

  0%|          | 0/50 [00:00<?, ?it/s]

In [140]:
hrefs = []

for link in content:
    raw_link = link.select('a.more-link')
    for i in range(len(raw_link)):
        href = raw_link[i]['href']
        hrefs.append(href)

In [190]:
hrefs[0]

'http://blauerbote.com/2021/07/26/the-recommended-genetic-manipulation-of-humans/'

In [192]:
pages_raw =[]

for url in tqdm(hrefs):
    response = requests.get(url)
    pages_raw.append(response) 
    wait_time= 0.1
    sleep(wait_time)

  0%|          | 0/1128 [00:00<?, ?it/s]

In [194]:
len(pages_raw)

1128

In [204]:
fake_paragaphs = get_content(pages_raw)

  0%|          | 0/1128 [00:00<?, ?it/s]

In [206]:
fake_contents = []

for page in tqdm(range(len(pages_raw))):
    paragraph = BeautifulSoup(pages_raw[page].content, "html.parser")
    fake_contents.append(paragraph)

  0%|          | 0/1128 [00:00<?, ?it/s]

In [207]:
len(fake_contents)

1128

In [None]:
# header > h1 (header)
# div.entry-content.clearfix (text)

In [216]:
fake_contents[0].select('header > h1')[0].get_text()

'The recommended genetic manipulation of humans'

In [None]:
fake_contents[0].select('div.entry-content.clearfix')[0].get_text()

In [225]:
bbl_fake_titles = []
bbl_fake_content = []

for article in tqdm(range(len(fake_contents))):
    title = fake_contents[article].select('header > h1')[0].get_text()
    content = fake_contents[article].select('div.entry-content.clearfix')[0].get_text()
    bbl_fake_titles.append(title)
    bbl_fake_content.append(content)

  0%|          | 0/1128 [00:00<?, ?it/s]

In [226]:
print(len(bbl_fake_titles))
print(len(bbl_fake_content))

1128
1128


In [227]:
fake_bbl_news = pd.DataFrame(bbl_fake_titles, columns=['titles'])

In [229]:
fake_bbl_news['content'] = bbl_fake_content

In [231]:
fake_bbl_news['label'] = 'False'

In [234]:
fake_bbl_news.head()

Unnamed: 0,titles,content,label
0,The recommended genetic manipulation of humans,\nThe WHO has published official recommendatio...,False
1,Zensur: Google-Suche zeigt kenfm.de nicht mehr an,\nWer „kenfm“ oder „kenfm.de“ in die Google-Su...,False
2,Professor Krause: „Labordaten allein sollten n...,\n„Labordaten allein sollten nicht unser Hande...,False
3,Deutschland boykottiert Anti-Rassismus-Konfere...,\nDeutschland boykottiert Anti-Rassismus-Konfe...,False
4,Tatsächlich ist nicht das System dumm und fade...,\n\nTherapien: Tatsächlich ist nicht das Syste...,False


In [235]:
fake_bbl_news['content'] = fake_bbl_news['content'].replace(r'\n',' ', regex=True)

In [236]:
fake_bbl_news.head()

Unnamed: 0,titles,content,label
0,The recommended genetic manipulation of humans,The WHO has published official recommendation...,False
1,Zensur: Google-Suche zeigt kenfm.de nicht mehr an,Wer „kenfm“ oder „kenfm.de“ in die Google-Suc...,False
2,Professor Krause: „Labordaten allein sollten n...,„Labordaten allein sollten nicht unser Handel...,False
3,Deutschland boykottiert Anti-Rassismus-Konfere...,Deutschland boykottiert Anti-Rassismus-Konfer...,False
4,Tatsächlich ist nicht das System dumm und fade...,Therapien: Tatsächlich ist nicht das System ...,False


In [237]:
fake_bbl_news.to_csv('fake_ger_news_2.csv', index = False)

In [264]:
ls

German News Scrapping.ipynb  real_ger_news_1
GermanFakeNC.json            real_ger_news_1.csv
fake_ger_news_1.csv          real_ger_news_2.csv
fake_ger_news_2.csv          real_ger_news_3.csv


### Getting additional fake news articles to drive model accuracy

We will take articles from unzensuriert as well. They are supported by the radical right FPÖ in Austria, but with a more eloquent use of language than other sites we scraped for our fake news. This will come in handy to have more diverse vocabulary to train our tool on later

In [290]:
url = 'https://www.unzensuriert.at/hassimnetz/'

In [291]:
response=requests.get(url)
response.status_code

200

In [292]:
unzens = BeautifulSoup(response.content, "html.parser")

In [293]:
raw_link = unzens.select('a.searched-article-title')

In [294]:
raw_link[1]

<a class="searched-article-title" href="https://www.unzensuriert.at/content/120228-neujahrs-aufruf-der-linksextremen-wir-sind-die-die-bullenautos-anzuenden/">„Wir sind die, die Bullenautos anzünden“</a>

In [295]:
hrefs = []

for i in range(len(raw_link)):
    href = raw_link[i]['href']
    hrefs.append(href)

In [296]:
pages_raw =[]

for url in tqdm(hrefs):
    response = requests.get(url)
    pages_raw.append(response) 
    wait_time= 0.1
    sleep(wait_time)

  0%|          | 0/84 [00:00<?, ?it/s]

In [297]:
fake_contents = []

for page in tqdm(range(len(pages_raw))):
    paragraph = BeautifulSoup(pages_raw[page].content, "html.parser")
    fake_contents.append(paragraph)

  0%|          | 0/84 [00:00<?, ?it/s]

In [298]:
unzens_fake_titles = []
unzens_fake_content = []

for article in tqdm(range(len(fake_contents))):
    title = fake_contents[article].select('span.single-big-article-title')[0].get_text()
    content = fake_contents[article].select('div.single-big-article-content-wrapper > div.single-big-article-content-title-tag-box')[0].get_text()
    unzens_fake_titles.append(title)
    unzens_fake_content.append(content)

  0%|          | 0/84 [00:00<?, ?it/s]

In [299]:
unzens_pol_news = pd.DataFrame(unzens_fake_titles, columns=['titles'])

In [300]:
unzens_pol_news['content'] = unzens_fake_content

In [301]:
unzens_pol_news['label'] = 'False'

In [302]:
unzens_pol_news.head()

Unnamed: 0,titles,content,label
0,Von wegen “Marktplatz der Meinungen”: Twitter ...,Meinungsfreiheit 9. Jänner 2021 / 11:15Von ...,False
1,Neujahrs-Aufruf der Linksextremen: „Wir sind d...,linksextreme Gewalt 8. Jänner 2021 / 09:25N...,False
2,Islamist droht mit Mord – doch der Staatsanwal...,Justiz 30. Oktober 2020 / 11:02Islamist dro...,False
3,FPÖ-Zensur auf “Facebook”: FPÖ fragt Kurz nach...,Facebook 5. Oktober 2020 / 10:54FPÖ-Zensur ...,False
4,Neos-Mandatar und Rechtsanwalt(!) beleidigt FP...,Justiz 19. September 2020 / 06:00Neos-Manda...,False


In [303]:
unzens_pol_news['content'] = unzens_pol_news['content'].replace(r'\n',' ', regex=True)

In [304]:
unzens_pol_news.head()

Unnamed: 0,titles,content,label
0,Von wegen “Marktplatz der Meinungen”: Twitter ...,Meinungsfreiheit 9. Jänner 2021 / 11:15Von ...,False
1,Neujahrs-Aufruf der Linksextremen: „Wir sind d...,linksextreme Gewalt 8. Jänner 2021 / 09:25N...,False
2,Islamist droht mit Mord – doch der Staatsanwal...,Justiz 30. Oktober 2020 / 11:02Islamist dro...,False
3,FPÖ-Zensur auf “Facebook”: FPÖ fragt Kurz nach...,Facebook 5. Oktober 2020 / 10:54FPÖ-Zensur ...,False
4,Neos-Mandatar und Rechtsanwalt(!) beleidigt FP...,Justiz 19. September 2020 / 06:00Neos-Manda...,False


In [305]:
unzens_pol_news.to_csv('fake_ger_news_7.csv', index = False)