In [6]:
import re
import spacy
#!python -m spacy download de_core_news_md
#!python -m spacy download en_core_web_lg 
en_nlp = spacy.load("en_core_web_lg")
#de_nlp = spacy.load("de_core_news_md")

import nltk
# nltk.download("stopwords")
# nltk.download("punkt")
# nltk.download('vader_lexicon')
# nltk.download("averaged_perceptron_tagger")
##from nltk import pos_tag, pos_tag_sents, word_tokenize, sent_tokenize
##from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump, load

import warnings
warnings.filterwarnings("ignore")

In [122]:
df_deutsch = pd.read_csv("deutsch_stances.csv", index_col = 0)
df_deutsch.reset_index(inplace=True, drop = True)

In [123]:
df_english = pd.read_csv("english_stances.csv", index_col = 0)
df_english.reset_index(inplace=True, drop = True)

In [151]:
def stemmer(text):
        tokens = nltk.word_tokenize(text)
        stems = []
        for item in tokens:
            if item.isdigit():
                continue
            elif item.isalnum():
                stems.append(PorterStemmer().stem(item))
        return stems

def clean_text(text):
    website_pattern = re.compile(r'\((.*?)\)') 
    slash_pattern = re.compile(r'[\[\]]')
    text = re.sub(website_pattern, "", text)
    text = re.sub(slash_pattern, "", text)
    return text

def generate_base(df, column, language, model = "glove"):
    lang = "en" if language == "english" else "de"
    
    if model == "glove":
        nlp = en_nlp if language == "english" else de_nlp
        embeddings = np.array([nlp(x).vector for x in list(df[column].values)])
        shape = embeddings.shape[1]
        columns = ["{}_dimension_{}".format(column, i) for i in range(shape)]
        ff = pd.DataFrame(data=embeddings, columns=columns)
        
    elif model == "tfidf-stemmed":
        model = load("tfidf_"+ lang +"_stemmed.sav")
        data = model.transform(df[column])
        columns = [column +":" + col for col in model.get_feature_names()]
        ff = pd.SparseDataFrame(data = data, columns=columns).fillna(0)
        
    elif model == "tfidf-unstemmed":
        model = load("tfidf_"+ lang +"_unstemmed.sav")
        data = model.transform(df[column])
        columns = [column +":" + col for col in model.get_feature_names()]
        ff = pd.SparseDataFrame(data = data, columns=columns).fillna(0)
        
    return ff

def generate_additional(df, column, language, modes = ["pos", "ner", "sentiment"]):
    nlp = en_nlp if language == "english" else de_nlp
    docs = [nlp(x) for x in list(df[column].values)]
    n = len(df)
    dfs = []
    
    # use pos tags
    if "pos" in modes:
        # research tag by using spacy.explain: spacy.explain("ADP")
        pos_tags = {"PRON": [0]*n, 
                    "ADV": [0]*n, 
                    "ADJ": [0]*n, 
                    "ADP": [0]*n,
                    "DET": [0]*n,
                    "AUX": [0]*n,
                    "VERB": [0]*n, 
                    "NOUN": [0]*n, 
                    "PUNCT": [0]*n, 
                    "NUM": [0]*n}

        for i, doc in enumerate(docs):
            for token in doc:
                if token.pos_ in pos_tags.keys():
                    pos_tags[token.pos_][i] += 1
        tf = pd.DataFrame.from_dict(pos_tags)
        tf.columns = [column +":" + col for col in tf.columns]
        dfs.append(tf)
    
    # use sentiment tas: negative, neutral, positive and compound
    if "sentiment" in modes:
        sentiment = [sid.polarity_scores(x) for x in list(df[column].values)]
        tf = pd.DataFrame(data=sentiment)
        tf.columns = [column +":" + col for col in tf.columns]
        dfs.append(tf)
        
    # use named entity recognition:
    if "ner" in modes:
        ner_types = {"PERSON": [0]*n, 
                    "NORP": [0]*n, 
                    "FAC": [0]*n, 
                    "ORG": [0]*n,
                    "GPE": [0]*n,
                    "LOC": [0]*n,
                    "PRODUCT": [0]*n, 
                    "EVENT": [0]*n, 
                    "WORK_OF_ART": [0]*n, 
                    "LAW": [0]*n,
                    "LANGUAGE": [0]*n, 
                    "QUANITY": [0]*n,
                    "ORDINAL": [0]*n, 
                    "CARDINAL": [0]*n}
        for i, doc in enumerate(docs):
            for entity in doc.ents:
                if entity.label_ in ner_types.keys():
                    ner_types[entity.label_][i] += 1
        tf = pd.DataFrame.from_dict(ner_types)
        tf.columns = [column +":" + col for col in tf.columns]
        dfs.append(tf)
    if "structure" in modes:
        pass
    
    return pd.concat(dfs, axis=1)

def prep_dataset(df, model, language, modes = []):
    dfs = []
    df_stance = pd.concat([df['stance']], axis=1)
    df_stance['stance'] = df_stance.stance.apply(lambda x: 1 if x == "RA" else 0)
    dfs.append(df_stance)
    #dfs.append(generate_base(df, "child_text", model=model, language=language))
    #dfs.append(generate_base(df, "parent_text", model=model, language=language))
    if modes != []:
        dfs.append(generate_additional(df, "child_text", language=language, modes = modes))
        dfs.append(generate_additional(df, "parent_text", language=language, modes = modes))
    return pd.concat(dfs, axis = 1)

In [152]:
%%time
df = prep_dataset(df_english[:10000], model = "glove", language="english", modes = ["ner", "sentiment", "pos"])
df

Wall time: 6min 47s


Unnamed: 0,stance,child_text:PRON,child_text:ADV,child_text:ADJ,child_text:ADP,child_text:DET,child_text:AUX,child_text:VERB,child_text:NOUN,child_text:PUNCT,...,parent_text:GPE,parent_text:LOC,parent_text:PRODUCT,parent_text:EVENT,parent_text:WORK_OF_ART,parent_text:LAW,parent_text:LANGUAGE,parent_text:QUANITY,parent_text:ORDINAL,parent_text:CARDINAL
0,1,0,0,2,1,3,1,1,3,1,...,0,0,0,1,0,0,0,0,0,0
1,1,0,2,5,5,5,0,5,11,2,...,0,0,0,0,0,0,0,0,0,0
2,0,0,2,2,3,4,1,4,7,1,...,0,0,0,1,0,0,0,0,0,0
3,1,0,0,2,4,3,0,3,9,11,...,0,0,0,1,0,0,0,0,0,0
4,1,0,0,2,4,2,0,1,6,4,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,2,2,2,1,2,0,11,6,...,0,0,0,0,0,0,0,0,0,1
9996,0,0,0,4,1,0,0,0,6,1,...,0,0,0,0,0,0,0,0,0,0
9997,1,0,0,3,1,2,0,1,4,2,...,0,0,0,0,0,0,0,0,0,0
9998,1,0,0,2,3,1,0,0,7,4,...,0,0,0,0,0,0,0,0,0,0


In [153]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
# random state value
rsv = 42
# cpus used for training
n_jobs = -1 

models = {"SVG": SVC(probability=True,
                    random_state=rsv),
          "LogReg": LogisticRegression(random_state=rsv,
                                      n_jobs=n_jobs),
          "RanFor": RandomForestClassifier(random_state=rsv,
                                          n_jobs=n_jobs),
          "GausNB": GaussianNB(),
          "LDA": LinearDiscriminantAnalysis(),
          "KNN": KNeighborsClassifier(n_jobs=n_jobs)}

# split in training data matrix X and target y
def generate_cv_sets(df: pd.DataFrame):
    X = df.loc[:, df.columns != 'stance']
    y = df[['stance']].values.ravel()
    X_train, X_test, y_train, y_test = train_test_split(X, y) 
    return X_train, X_test, y_train, y_test

In [None]:
X_train, X_test, y_train, y_test = generate_cv_sets(df)
results = {}
for name, model in models.items():
    model.fit(X_train.fillna(0), y_train)
    score = model.score(X_test.fillna(0), y_test)
    cf_matrix = metrics.confusion_matrix(y_test, model.predict(X_test.fillna(0)))
    results[name] = {"score": score, "cfm": cf_matrix}
    models[name] = model

In [156]:
X_train, X_test, y_train, y_test = generate_cv_sets(df)
results = {}
model = LogisticRegression(random_state=42, n_jobs=-1)
model.fit(X_train.fillna(0), y_train)
score = model.score(X_test.fillna(0), y_test)
cf_matrix = metrics.confusion_matrix(y_test, model.predict(X_test.fillna(0)))
score, cf_matrix

(0.602, array([[716, 551],
        [444, 789]], dtype=int64))

In [147]:
results

{'SVG': {'score': 0.636, 'cfm': array([[159,   0],
         [ 91,   0]], dtype=int64)},
 'LogReg': {'score': 0.652, 'cfm': array([[130,  29],
         [ 58,  33]], dtype=int64)},
 'RanFor': {'score': 0.656, 'cfm': array([[137,  22],
         [ 64,  27]], dtype=int64)},
 'GausNB': {'score': 0.604, 'cfm': array([[124,  35],
         [ 64,  27]], dtype=int64)},
 'LDA': {'score': 0.6, 'cfm': array([[104,  55],
         [ 45,  46]], dtype=int64)},
 'KNN': {'score': 0.684, 'cfm': array([[138,  21],
         [ 58,  33]], dtype=int64)}}