# DWH bauen

# Import

In [None]:
import os
from datetime import datetime
import pandas as pd
import requests
from bs4 import BeautifulSoup
from glob import glob
from sqlalchemy import create_engine

In [None]:
# German stopword list used by process_html to discard filler words.
stopwords_url = "https://raw.githubusercontent.com/solariz/german_stopwords/master/german_stopwords_full.txt"
# A timeout keeps the notebook from hanging forever on a stalled connection,
# and raise_for_status() stops an HTTP error page from being used as stopwords.
stopwords_response = requests.get(stopwords_url, allow_redirects=True, timeout=30)
stopwords_response.raise_for_status()
# The first nine lines of the download are skipped
# (presumably a comment header in the upstream file — TODO confirm).
stopwords_list = stopwords_response.text.split("\n")[9:]

# Funktionen definieren

In [1]:
def read_html_file(file_name, encoding="utf-8"):
    """Return the complete text content of ``file_name``.

    The file is opened in text mode and decoded with ``encoding``
    (UTF-8 unless the caller passes something else).
    """
    with open(file_name, mode="r", encoding=encoding) as handle:
        return handle.read()


def process_html(text):
    """Turn an HTML document into a list of lower-cased content words.

    Markup is stripped with BeautifulSoup, the remaining text is lower-cased
    and split on whitespace. A token is kept only if it is longer than two
    characters and does not appear in the module-level ``stopwords_list``.

    Args:
        text: HTML (or already plain) text.

    Returns:
        list[str]: The surviving tokens in document order, duplicates included.
    """
    plain_text = BeautifulSoup(text, "html.parser").text
    # Build the lookup set once per call: testing membership against the raw
    # list is a linear scan that would otherwise run for every single token.
    stopwords = set(stopwords_list)
    # str.split() without an argument splits on any run of whitespace, so
    # "\r", non-breaking spaces etc. no longer stay glued to a word.
    return [
        token
        for token in plain_text.lower().split()
        if len(token) > 2 and token not in stopwords
    ]

def parse_html(name, date, file_name, encoding):
    """Count word frequencies in one stored HTML page.

    Args:
        name: Label of the paper; copied into the ``paper`` column.
        date: Value copied into the ``date`` column.
        file_name: Path of the HTML file to read.
        encoding: Text encoding used to read the file.

    Returns:
        pandas.DataFrame: One row per distinct word (the word is also the
        index) with the columns ``count``, ``word``, ``date`` and ``paper``.
    """
    content = read_html_file(file_name, encoding)
    # process_html strips the markup itself. Extracting the text here first
    # would parse the document twice, and the second pass would treat
    # entity-decoded text such as "<b>" as a real tag and drop it.
    items = process_html(content)
    item_count = pd.Series(items).value_counts()
    count = item_count.to_frame()
    count.columns = ["count"]
    count["word"] = count.index
    count["date"] = date
    count["paper"] = name
    return count

def parse_csv_files_in_directory(directory):
    """Build one word-count table from every CSV file in ``directory``.

    Each ``*.csv`` file is read with pandas and rows containing any missing
    value are discarded. For every remaining row the columns ``name``,
    ``file_name``, ``date`` and ``encoding`` are read; if ``file_name``
    exists on disk it is handed to ``parse_html``, otherwise a message is
    printed and the row is skipped.

    Returns:
        pandas.DataFrame: All per-page results concatenated with a fresh
        integer index, or an empty DataFrame when nothing was parsed.
    """
    frames = []

    for manifest_path in glob(os.path.join(directory, "*.csv")):
        complete_rows = pd.read_csv(manifest_path).dropna()

        for _, entry in complete_rows.iterrows():
            paper = entry["name"]
            html_path = entry["file_name"]
            published = entry["date"]
            html_encoding = entry["encoding"]

            # Guard clause: report pages that are listed but not on disk.
            if not os.path.exists(html_path):
                print(f"Datei nicht gefunden: {html_path}")
                continue

            frames.append(parse_html(paper, published, html_path, html_encoding))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# CSV anpassen

In [None]:
DATA_LAKE_DIR = "input/data-lake"

# Papers to keep; rows whose 'name' is not in this list are removed.
behalten = ["sz", "zeit", "faz", "heise", "golem", "tagesspiegel", "taz", "abendblatt", "berliner", "welt", "esslinger", "ntv", "pioneer", "suedwest", "uebermedien", "dlf", "spiegel", "stern", "tagesschau", "wiwo"]

# NOTE: every CSV in DATA_LAKE_DIR is overwritten in place with its filtered rows.
for file_name in os.listdir(DATA_LAKE_DIR):
    if not file_name.endswith(".csv"):
        continue
    file_path = os.path.join(DATA_LAKE_DIR, file_name)

    # Read the CSV file. pandas raises EmptyDataError for a file without any
    # columns, so a truly empty file is reported here instead of aborting
    # the whole loop with a traceback.
    try:
        df = pd.read_csv(file_path)
    except pd.errors.EmptyDataError:
        print(f"Die Datei ist leer: {file_name}")
        continue

    # Without a 'name' column there is nothing to filter on; the file is
    # left untouched and reported with a message that names the real cause.
    if "name" not in df.columns:
        print(f"Spalte 'name' fehlt: {file_name}")
        continue

    # Keep only the rows whose 'name' is listed in 'behalten' ...
    df = df[df["name"].isin(behalten)]

    # ... and write the filtered table back over the original file.
    df.to_csv(file_path, index=False)


In [None]:
# Reduce a text column to the first matching keyword per cell.
def filter_words(df, column, keywords):
    """Replace each value in ``df[column]`` by the first keyword it contains.

    Matching is a plain substring test, tried in the order of ``keywords``.
    Cells that contain no keyword, and cells that are not strings at all
    (None, NaN, numbers), become ``None``.

    Args:
        df: DataFrame to modify. It is changed IN PLACE.
        column: Name of the column to rewrite.
        keywords: Iterable of substrings to look for.

    Returns:
        The same ``df`` object that was passed in.
    """
    # Materialise once so a one-shot iterable is not exhausted after row one.
    keywords = tuple(keywords)

    def extract_keyword(text):
        # Non-string cells would make the `in` test raise TypeError.
        if not isinstance(text, str):
            return None
        return next((keyword for keyword in keywords if keyword in text), None)

    df[column] = df[column].apply(extract_keyword)
    return df


In [None]:
# Parse every page listed in the data-lake CSV files into one word-count
# table (same directory the DATA_LAKE_DIR constant points to).
result = parse_csv_files_in_directory(directory="input/data-lake")

In [None]:
# Die Liste der zu filternden Parteien
keywords = ['spd', 'cdu', 'afd', 'fdp', 'grünen', 'linke']

filtered_df = filter_words(result, 'word', keywords)

In [None]:
# Discard every row that contains a missing value, then renumber the rows.
# reset_index() without drop=True keeps the previous row labels as an extra
# "index" column — NOTE(review): confirm that column is wanted downstream.
result = result.dropna()
result = result.reset_index()

In [None]:
# Independent copy of the filtered table, followed by a preview of its
# first five rows.
party_df = result.copy(deep=True)
party_df.head(n=5)

# Wahlen-CSV einlesen

In [None]:
# Read the first seven ';'-separated columns of the polling file.
# No parse_dates here: the previous parse_dates=[1] pointed at the second
# column (renamed to 'CDU' below), not at the date column in position 0.
# The date is converted explicitly with a fixed format instead.
wahlen_df = pd.read_csv('wahlen.csv', delimiter=';', usecols=[0, 1, 2, 3, 4, 5, 6])
wahlen_df.columns = ['date', 'CDU', 'SPD', 'Grüne', 'FDP', 'Linke', 'AfD']
# Dates are expected as day.month.two-digit-year.
wahlen_df['date'] = pd.to_datetime(wahlen_df['date'], format='%d.%m.%y')

wahlen_df.head()

# SQL-Verbindung

In [None]:
# Open the SQLite database and (re)create both tables from the DataFrames.
# NOTE(review): four slashes after "sqlite:" denote an absolute path in
# SQLAlchemy URLs, so this resolves from the filesystem root rather than
# relative to the notebook — confirm that this is the intended location.
engine = create_engine("sqlite:////../temp/meine_datenbank.db")
party_df.to_sql(
    name="party",
    con=engine,
    if_exists="replace",
    index=False,
)
wahlen_df.to_sql(
    name="wahlen",
    con=engine,
    if_exists="replace",
    index=False,
)