# Script für die Zusammenführung der verschiedenen deutschen Sentiment Lexika zu einer CSV Datei

In [None]:
import pandas as pd
import numpy as np  

In [None]:
# Merging SentiWS Positive and Negative txt files 
def combine_files(file1, file2, output_file):
    """Merge two UTF-8 text files into one sorted, de-duplicated file.

    Line endings are normalized before de-duplication so that a final line
    without a trailing newline is neither glued to the following line after
    sorting nor treated as different from an otherwise identical line.
    Empty lines are dropped.

    Args:
        file1: Path of the first input file.
        file2: Path of the second input file.
        output_file: Path of the file to write; overwritten if it exists.
    """
    unique_lines = set()
    for path in (file1, file2):
        with open(path, 'r', encoding='utf-8') as src:
            for line in src:
                entry = line.rstrip('\n')
                if entry:
                    unique_lines.add(entry)

    # Every written line gets exactly one newline terminator
    with open(output_file, 'w', encoding='utf-8') as out:
        out.writelines(entry + '\n' for entry in sorted(unique_lines))

# Input lexicon files (negative / positive) and the merged output file
file_negative, file_positive = (
    'SentiWS_v2.0_Negative.txt',
    'SentiWS_v2.0_Positive.txt',
)
output_combined = 'SentiWS_v2.0_combined.txt'

combine_files(file1=file_negative, file2=file_positive, output_file=output_combined)


In [None]:
# Combine the GermanPolarityClues negative and positive TSV files into one TSV
file_negative = 'GermanPolarityClues-Negative-21042012.tsv'
file_positive = 'GermanPolarityClues-Positive-21042012.tsv'
output_file = 'GermanPolarityClues_combined.tsv'

# The files are read with header=None, so the column names are supplied here
column_names = ["Wortform", "Lemma", "POS", "Polarität", "Sentimentwert", "Quelle"]

read_options = dict(sep='\t', header=None, names=column_names)
df_neg = pd.read_csv(file_negative, **read_options)
df_pos = pd.read_csv(file_positive, **read_options)

# Negative rows first, then positive rows, with a fresh 0..n-1 index
df_combined = pd.concat([df_neg, df_pos], ignore_index=True)

# Write the combined table (including the header row) as TSV
df_combined.to_csv(output_file, sep='\t', index=False, encoding='utf-8')

Nachdem die Lexika, die in mehreren Dateien vorlagen, zusammengeführt wurden, geht es jetzt darum, alle bisherigen Lexika in eine große CSV-Datei zu überführen. Dabei soll möglichst keine Information verloren gehen und das Wort als eine Art ID verwendet werden. Die CSV-Datei soll danach ungefähr so aussehen:
| Wort | Worttyp | SentiWS | PolArt | GermanPolarityClues | Morph | MLSA | UniSent | AffNorms | Etc. (andere Lexika, evtl. Flexionsformen) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| Beispiel | NN | 0.355 | NEG=0.7 | negative-/-0.058 | NEG | -0.1025 | -1 | 7.786 | |

Weitere interessante Lexika umfassen: 
- **AffDict** (Schröder, 2011)  
- **Aff-Meaning** (Ambrasat et al., 2014)  
- **ALPIN** (Kolb et al., 2021)  
- **ANGST** (Schmidtke et al., 2014)  
- **BAWL-R** (Võ et al., 2009)  
- **SePL** (Rill et al., 2012) 

In [None]:
import pandas as pd
import xml.etree.ElementTree as ET

# 1. SentiWS
# Lines are parsed as "<word>|<POS>\t<polarity>[\t<further fields>]".
sentiws = []
with open("SentiWS_v2.0_combined.txt", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split("\t")
        # Only the first two fields are used, so a line is usable as soon as
        # it has them; anything after the polarity is ignored. Requiring a
        # third field would silently drop entries that lack it.
        if len(parts) < 2 or "|" not in parts[0]:
            continue
        # Split on the last "|" so the unpacking cannot fail on extra pipes
        word, worttyp = parts[0].rsplit("|", 1)
        sentiws.append({
            "Wort": word.lower(),
            "Worttyp": worttyp,
            "SentiWS": float(parts[1])
        })
# Explicit columns keep the frame mergeable on "Wort" even if no line matched
sentiws_df = pd.DataFrame(sentiws, columns=["Wort", "Worttyp", "SentiWS"])

# 2. PolArt
polart = []
with open("PolArt_lexicon.txt", encoding="utf-8") as f:
    for raw_line in f:
        fields = raw_line.strip().split()
        # Only lines with exactly three whitespace-separated fields are used
        if len(fields) != 3:
            continue
        token, polarity_label = fields[0], fields[1]
        polart.append({"Wort": token.lower(), "PolArt": polarity_label})
# Keep the first entry per lower-cased word
polart_df = pd.DataFrame(polart).drop_duplicates(subset="Wort")

# 3. GermanPolarityClues
gpc_df = pd.read_csv("GermanPolarityClues_combined.tsv", sep="\t", encoding="utf-8")

# Translate the POS codes into the tag names used in the "Worttyp" column;
# codes that are not listed become NaN.
pos_map = {
    "NN": "NN",
    "AD": "ADJX",
    "VV": "VVINF"
}
gpc_df["Worttyp"] = gpc_df["POS"].map(pos_map)
gpc_df = gpc_df.rename(columns={
    "Lemma": "Wort",
    "Polarität": "GermanPolarityClues"
})
# Lower-case first, then de-duplicate: the merge key is the lower-cased word,
# so de-duplicating on the original casing would leave case variants behind
# as duplicate keys.
gpc_df["Wort"] = gpc_df["Wort"].str.lower()
gpc_df = gpc_df.drop_duplicates(subset=["Wort"])
gpc_df = gpc_df[["Wort", "Worttyp", "GermanPolarityClues"]]

# 4. Morph
morph = []
with open("Morph_combined.txt", encoding="utf-8") as f:
    for raw_line in f:
        fields = raw_line.strip().split("\t")
        # Only lines with exactly two tab-separated fields are used
        if len(fields) != 2:
            continue
        tagged_word, label = fields
        # Keep only the text before the first underscore as the word
        bare_word = tagged_word.partition("_")[0]
        morph.append({"Wort": bare_word.lower(), "Morph": label})
# Keep the first entry per lower-cased word
morph_df = pd.DataFrame(morph).drop_duplicates(subset="Wort")

# 5. MLSA
mlsa_entries = []
tree = ET.parse("MLSA-presseRel.xml")
root = tree.getroot()
for entry in root.findall("entry"):
    word = entry.findtext("term")
    if not word:
        # No <term> text means there is no key to merge on; skip the entry
        # instead of failing on None.lower().
        continue
    mlsa = None
    for opinion in entry.findall("opinion"):
        # A missing "source" attribute is treated as "not an MLSA opinion".
        # If several MLSA opinions exist, the last one wins.
        if "MLSA" in opinion.get("source", ""):
            mlsa = float(opinion.get("polarity"))
    # Entries without an MLSA opinion are kept with a missing value
    mlsa_entries.append({
        "Wort": word.lower(),
        "MLSA": mlsa
    })
mlsa_df = pd.DataFrame(mlsa_entries).drop_duplicates(subset="Wort")

# 6. UniSent
unisent = []
with open("deu_unisent_lexicon.txt", encoding="utf-8") as f:
    for raw_line in f:
        fields = raw_line.strip().split("\t")
        # Only lines with exactly two tab-separated fields are used
        if len(fields) != 2:
            continue
        token, label = fields
        unisent.append({"Wort": token.lower(), "UniSent": int(label)})
# Keep the first entry per lower-cased word
unisent_df = pd.DataFrame(unisent).drop_duplicates(subset="Wort")

# 7. AffNorms
affnorms_df = pd.read_csv("AffNorms_ratings.txt", sep="\t", encoding="utf-8")
# Lower-cased copy of "Word" serves as the merge key
affnorms_df = affnorms_df.assign(Wort=affnorms_df["Word"].str.lower())
affnorms_df = affnorms_df.rename(columns={"Val": "AffNorms_Val"})
# Keep only key and rating, first row per word
affnorms_df = affnorms_df.loc[:, ["Wort", "AffNorms_Val"]].drop_duplicates(subset="Wort")

# 8. ALPIN
import csv

alpin = []
# newline="" lets the csv module handle line endings and quoted fields itself
with open("ALPIN_v1.0.csv", encoding="utf-8", newline="") as f:
    reader = csv.reader(f)
    next(reader, None)  # Skip header (tolerates an empty file)
    for row in reader:
        # Blank or malformed rows are skipped rather than aborting the run
        # with an unpacking error.
        if len(row) != 3:
            continue
        word, _tag, sentiment = (field.strip() for field in row)
        # Exclude lines in which the word is a hyperlink
        if word.startswith(("http://", "https://")):
            continue
        alpin.append({
            "Wort": word.lower(),
            "ALPIN_sentiment_scaled": float(sentiment)
        })
# Explicit columns keep the frame mergeable on "Wort" even if no row matched
alpin_df = pd.DataFrame(alpin, columns=["Wort", "ALPIN_sentiment_scaled"])
alpin_df = alpin_df.drop_duplicates(subset=["Wort"])

# 9. ANGST
import openpyxl
angst_df = (
    pd.read_excel("ANGST.xlsx", engine="openpyxl")
    .rename(columns={"G-word": "Wort", "VAL_Mean": "ANGST_Valence"})
)
# Lower-case the merge key, then keep key and valence, first row per word
angst_df["Wort"] = angst_df["Wort"].str.lower()
angst_df = angst_df.loc[:, ["Wort", "ANGST_Valence"]].drop_duplicates(subset="Wort")


# 10. AffDict
affdict_df = pd.read_excel("AffDict.xls", engine="xlrd")

# Rename the two columns that are used further on
affdict_df = affdict_df.rename(columns={"German Word": "Wort", "E unisex": "AffDict_Eval"})

# Remove rows with NaN in "Wort"
affdict_df = affdict_df.dropna(subset=["Wort"])

# Clean "Wort": strip whitespace, lower-case, keep only purely alphabetic words
affdict_df["Wort"] = affdict_df["Wort"].astype(str).str.strip().str.lower()
affdict_df = affdict_df[affdict_df["Wort"].str.fullmatch(r"[a-zäöüß]+")]

# One row per word, like the other lexicon frames, so that the outer merge on
# "Wort" does not multiply rows.
affdict_df = affdict_df[["Wort", "AffDict_Eval"]].drop_duplicates(subset="Wort")


# Merging all
master_df = sentiws_df.copy()
master_df["Wort"] = master_df["Wort"].str.lower()

# NOTE(review): an `nrc_df` entry used to be listed here, but no such frame is
# built in this script, so the list raised a NameError. Re-add it once an NRC
# loader exists.
dfs_to_merge = [
    polart_df, gpc_df, morph_df, mlsa_df,
    unisent_df, affnorms_df, alpin_df,
    angst_df, affdict_df
]

for df in dfs_to_merge:
    # Outer join keeps every word that appears in any lexicon. Columns that
    # exist on both sides get the "_other" suffix on the incoming side.
    master_df = master_df.merge(df, on="Wort", how="outer", suffixes=("", "_other"))
    if "Worttyp_other" in master_df.columns:
        # Collapse both word-type columns into a single "Worttyp": keep the
        # existing value and fill gaps from the incoming lexicon.
        worttyp_other = master_df.pop("Worttyp_other")
        master_df["Worttyp"] = master_df["Worttyp"].fillna(worttyp_other)

# Cleaning
master_df = master_df[master_df["Wort"].notna()]
master_df["Wort"] = master_df["Wort"].astype(str)
# Remove rows where "Wort" does not start with a letter or number
master_df = master_df[master_df["Wort"].str.match(r"^[0-9a-zäöüß]")]

# One row per word (the first occurrence is kept)
master_df = master_df.drop_duplicates(subset="Wort")

# Sort alphabetically and use the resulting row position as ID
master_df = master_df.sort_values("Wort").reset_index(drop=True)
master_df.insert(0, "ID", master_df.index)

# Export
master_df.to_csv("sentiment_lexika_merged_clean.csv", index=False, encoding="utf-8")


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Treat only empty fields as missing. With pandas' default NA markers, literal
# words such as "null", "nan" or "na" in the "Wort" column would be read as NaN.
df = pd.read_csv(
    "sentiment_lexika_merged_clean.csv",
    keep_default_na=False,
    na_values=[""],
)

# Parse PolArt entries
def parse_polart(val):
    """Convert a PolArt entry such as "NEG=0.7" into a signed score.

    Args:
        val: Raw cell value of the form "<TYPE>=<score>", or a missing value.

    Returns:
        ``-score`` for NEG, ``score`` for POS, ``0.0`` for NEU, and ``np.nan``
        for missing values, any other type label, or malformed entries.
    """
    if pd.isna(val):
        return np.nan
    try:
        typ, score = val.split("=")
        score = float(score)
    except (AttributeError, TypeError, ValueError):
        # Not a string, not exactly one "=", or a non-numeric score
        return np.nan
    if typ == "NEG":
        return -score
    if typ == "POS":
        return score
    if typ == "NEU":
        return 0.0
    return np.nan  # INT, SHI... ignore

df['PolArt_num'] = df['PolArt'].apply(parse_polart)

# GermanPolarityClues mapping (labels not listed become NaN)
gpc_map = {'negative': -1, 'neutral': 0, 'positive': 1}
df['GPC_num'] = df['GermanPolarityClues'].map(gpc_map)

# Morph mapping (labels not listed become NaN)
morph_map = {'NEG': -1, 'NEU': 0, 'POS': 1}
df['Morph_num'] = df['Morph'].map(morph_map)

# Scaler configured for the target range [-1, 1]
scaler = MinMaxScaler(feature_range=(-1, 1))

# Rating columns to rescale; only those present in the file are used
columns_to_scale = ['AffNorms_Val', 'ANGST_Valence', 'AffDict_Eval']
existing_cols = [col for col in columns_to_scale if col in df.columns]
scaled_names = [col + "_scaled" for col in existing_cols]

if existing_cols:
    scaled_data = scaler.fit_transform(df[existing_cols])
    # Reuse df's index so the column-wise concat below aligns row by row
    scaled_df = pd.DataFrame(scaled_data, columns=scaled_names, index=df.index)
else:
    # Nothing to scale: fitting on zero columns would raise, so contribute
    # an empty, index-aligned frame instead.
    scaled_df = pd.DataFrame(index=df.index)

# Combine all relevant columns
final_df = pd.concat([
    df[['ID', 'Wort']],
    df[['SentiWS', 'PolArt_num', 'GPC_num', 'Morph_num', 'MLSA', 'UniSent', 'ALPIN_sentiment_scaled']],
    scaled_df
], axis=1)

# Export
final_df.to_csv("sentiment_lexika_scaled_final.csv", index=False, encoding="utf-8")

In [None]:
# # Versuch der Anreicherung des Datensatzes (Worttyp für alle Wörter) mit spacy
# import spacy
# nlp = spacy.load("de_core_news_lg")

# df = pd.read_csv("sentiment_lexika_merged.csv")

# # Funktion zur POS-Bestimmung mit spaCy
# def detect_pos_spacy(word):
#     doc = nlp(word)
#     if doc and doc[0].pos_:
#         return doc[0].pos_  # z. B. 'NOUN', 'ADJ', 'VERB', ...
#     return None

# # Neue Spalte immer befüllen 
# df["Worttyp_spacy"] = df["Wort"].apply(detect_pos_spacy)

# # Mapping auf SentiWS-kompatible Kürzel
# pos_map = {
#     "NOUN": "NN",
#     "ADJ": "ADJX",
#     "ADV": "ADJX",
#     "VERB": "VVINF",
#     "AUX": "VVINF",
# }
# df["Worttyp_spacy_mapped"] = df["Worttyp_spacy"].map(pos_map)

# # CSV speichern
# df.to_csv("sentiment_lexika_with_spacy.csv", index=False)