## Load real corona news and data to index

In [1]:
import pandas as pd
import os
import random
import json

In [2]:
data_path = "../data/"
fake_news_path = os.path.join(data_path+"fake_news/", "fake_news_corona.csv")

news_data_frame = pd.read_csv(fake_news_path, sep=";", encoding="utf8", names=["fake","real","real_url"])
news_data_frame.head()

Unnamed: 0,fake,real,real_url
0,Dort ruft eine Frau zu Hamsterkäufen auf. „Bev...,Märkte öffnen ganz normal. Wir wollen für unse...,https://www.bild.de/politik/inland/politik-inl...
1,"Ausgangssperre ab Montag, den 23.03.2020 für B...",,https://www.bild.de/politik/inland/politik-inl...
2,Arbeitsverbot ab dem 16.03.-21.03 wegen corona...,Zahlreiche Berufe können bzw. müssen weiter au...,https://www.bild.de/politik/inland/politik-inl...
3,"Eine Person mit laufender Nase mit Auswurf, ha...",Eine Person mit laufender Nase mit Auswurf: ei...,https://www.bild.de/politik/inland/politik-inl...
4,Das Coronavirus stirbt bei 26 bis 27 Grad.,"Es ist noch nicht klar, ob wärmere Temperature...",https://www.bild.de/politik/inland/politik-inl...


## Create dataset

In [3]:
dataset_path = "../data/preprocessed/"
data_file = os.path.join(dataset_path, "fake_news_train.tsv")

data_frame = pd.DataFrame(columns=["text", "label"])
for index, row in news_data_frame.iterrows():
    fake = row["fake"]
    data_frame = data_frame.append({"text": fake, "label": "fake"}, ignore_index=True)
    
    real = row["real"]
    if not pd.isna(real):
        data_frame = data_frame.append({"text": real, "label": "real"}, ignore_index=True)
        
    real_url = row["real_url"]

data_frame.to_csv(data_file, sep="\t", encoding="utf8", index=False)

## Create mock jsons

In [4]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="specify_your_app_name_here")
cities = ["Berlin", "München", "Hamburg", "Stuttgart", "Köln", "Heinsberg", "Bremen", "Potsdam", "Mannheim", "Darmstadt", "Kaiserslautern", "Nürnberg", "Freiburg"]
locations = []
rand_lat = (0.5 - random.random()) * 0.01
rand_long = (0.5 - random.random()) * 0.0001
for city in cities:
    location = geolocator.geocode(city)
    location.latitude += rand_lat
    location.longitude += rand_long
    locations.append(location)

AttributeError: can't set attribute

In [None]:
jsons = list()
for index, row in news_data_frame.iterrows():
    template = dict()
    fake = row["fake"]
    real = row["real"]
    real_url = row["real_url"]
    
    template["text"] = fake
    fake_prob = random.random()
    fake_prob = max(1-fake_prob, fake_prob)
    
    template["classification"] = {
        "fake": fake_prob,
        "unknown": 0.0,
        "real": 1-fake_prob
    }
    
    template["evidence"] = []
    if not pd.isna(real):
        template["evidence"].append({
            "title": "Real title",
            "text": real,
            "url": real_url if pd.isna(real_url) else None,
            "for_class": "real"
        })
        
    location = random.choice(locations)
    template["derived"] = dict()
    template["derived"]["locations"] = [{
          "country": "Deutschland",
          "country_code": "DE",
          "locality": "Deutschland",
          "region": "Bundesland",
          "sub_region": "Landkreis",
          "full_name": str(location),
          "geo": {
            "coordinates": [
                location.latitude, 
                location.longitude
            ],
            "type": "point"
          }
        } 
    ]
    
    jsons.append(template)

# save list in file
with open("../data/mock_jsons/mock_jsons.json","w+", encoding="utf8", newline='') as json_file:
    json.dump(jsons, json_file, indent=2, ensure_ascii=False)