Team 10 NLP Project

Dataset: One Million Post Corpus
Dataset Language: German

Goal: Classify and predict User-Post category


In [1]:
import sqlite3
import pandas as pd
# Open the corpus database; the handle stays bound to `cnx`.
cnx = sqlite3.connect('corpus.sqlite3')

# Read every table completely and print its first rows, one table at a time.
_loaded_frames = []
for _table_name in ("Annotations",
                    "Annotations_consolidated",
                    "Articles",
                    "Categories",
                    "CrossValSplit",
                    "Newspaper_Staff",
                    "Posts"):
    _frame = pd.read_sql_query("SELECT * FROM " + _table_name, cnx)
    print(_frame.head())
    _loaded_frames.append(_frame)

# Keep the one-name-per-table bindings (df = Annotations ... df7 = Posts).
df, df2, df3, df4, df5, df6, df7 = _loaded_frames

   ID_Post  ID_Annotator         Category  Value
0     3326             1    ArgumentsUsed      0
1     3326             1   Discriminating      0
2     3326             1    Inappropriate      0
3     3326             1         OffTopic      0
4     3326             1  PersonalStories      0
   ID_Post          Category  Value
0       79  PossiblyFeedback      1
1       81  PossiblyFeedback      1
2      132  PossiblyFeedback      1
3      134  PossiblyFeedback      1
4      139  PossiblyFeedback      1
   ID_Article                              Path          publishingDate  \
0           1           Newsroom/User/Community  2012-05-26 03:00:19.23   
1           2    Newsroom/User/Community/Regeln  2012-05-26 12:12:19.46   
2           3                    Diverses/mobil  2013-11-22 12:15:00.00   
3           4  Newsroom/User/mitmachen/Mitreden  2014-08-13 05:30:00.00   
4           5  Newsroom/User/mitmachen/Mitreden  2014-08-27 12:27:01.09   

                                       

In [4]:
##############################################################
################### Create Posts_Annotated ###################
##############################################################
import sqlite3

def getData(db_name):
    """Create and fill the ``Posts_Annotated`` table of a corpus database.

    Every post that has a consolidated ``ArgumentsUsed`` annotation is copied
    from ``Posts`` (ID, status, headline, body, vote counts) and extended by
    one column per annotation category read from ``Annotations_consolidated``.

    Args:
        db_name: Path of the SQLite file. ``sqlite3.connect`` creates the file
            if it does not exist yet.

    Returns:
        None. The function returns without writing anything when
        ``Posts_Annotated`` already contains rows.

    Raises:
        sqlite3.Error: If the source tables are missing or a statement fails.
            The connection is closed in that case and nothing is committed.
    """
    post_columns = ("ID_Post", "Status", "Headline", "Body",
                    "PositiveVotes", "NegativeVotes")
    categories = ("ArgumentsUsed", "Discriminating", "Inappropriate",
                  "OffTopic", "PersonalStories", "PossiblyFeedback",
                  "SentimentNegative", "SentimentNeutral", "SentimentPositive")
    all_columns = post_columns + categories

    # open and (if not exists) create database file
    con = sqlite3.connect(db_name)
    try:
        cur = con.cursor()

        # create table
        cur.execute("CREATE TABLE IF NOT EXISTS Posts_Annotated ("
                    "ID_Post           INTEGER PRIMARY KEY,"
                    "Status            INTEGER,"
                    "Headline          TEXT,"
                    "Body              TEXT,"
                    "PositiveVotes     INTEGER,"
                    "NegativeVotes     INTEGER,"
                    "ArgumentsUsed     INTEGER,"
                    "Discriminating    INTEGER,"
                    "Inappropriate     INTEGER,"
                    "OffTopic          INTEGER,"
                    "PersonalStories   INTEGER,"
                    "PossiblyFeedback  INTEGER,"
                    "SentimentNegative INTEGER,"
                    "SentimentNeutral  INTEGER,"
                    "SentimentPositive INTEGER"
                    ");")

        # Any existing row means the table was filled before. Inserting again
        # would violate the primary key, so stop here (this includes the
        # single-row case).
        cur.execute('SELECT Count(*) FROM Posts_Annotated')
        if cur.fetchone()[0] > 0:
            return

        # insert statement
        insert_table = "INSERT INTO Posts_Annotated ( {} ) values ( {} )".format(
            ", ".join(all_columns), ", ".join("?" * len(all_columns)))

        # select annotation
        get_annotation = ("SELECT Category, Value FROM Annotations_consolidated "
                          "WHERE ID_Post = ?")

        # get all posts we need; the category is a single-quoted SQL string
        # literal so it can never be parsed as an identifier
        cur.execute("SELECT " + ", ".join(post_columns) + " "
                    "FROM Posts "
                    "WHERE ID_Post IN ( "
                    "SELECT ID_Post "
                    "FROM Annotations_consolidated "
                    "WHERE Category = 'ArgumentsUsed');")
        posts = cur.fetchall()

        new_rows = []
        for post in posts:
            # Look annotations up by category name instead of relying on their
            # sort position; a category without annotation is stored as NULL.
            cur.execute(get_annotation, [post[0]])
            values_by_category = dict(cur.fetchall())
            new_rows.append(list(post) +
                            [values_by_category.get(name) for name in categories])

        cur.executemany(insert_table, new_rows)

        # Save (commit) the changes
        con.commit()
    finally:
        # close the database connection on every exit path
        con.close()

# Create and fill Posts_Annotated inside the corpus database file.
getData("corpus.sqlite3")

In [6]:
##############################################################
############## Create Posts_Annotated_combined ###############
##############################################################
import sqlite3
import pandas as pd
import numpy as np
import spacy
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
from spacy.tokens import Doc

def remove_punct_and_numbers(corpus):
    """Parse ``corpus`` and return a new Doc without punctuation, numbers and symbols.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    whose coarse part-of-speech tag is ``PUNCT``, ``NUM`` or ``SYM`` are left
    out, and the LOWER, POS, ENT_TYPE and IS_ALPHA attributes of the remaining
    tokens are copied onto the new document.

    Args:
        corpus: Text to parse.

    Returns:
        A ``Doc`` that shares the vocabulary of the parsed text and contains
        only the kept tokens.
    """
    # code from https://stackoverflow.com/questions/45375488/how-to-filter-tokens-from-spacy-document
    doc = nlp(corpus)
    attrs = [LOWER, POS, ENT_TYPE, IS_ALPHA]

    dropped = [index for index, token in enumerate(doc)
               if token.pos_ in ('PUNCT', 'NUM', 'SYM')]
    # Set lookup keeps the word filter below linear in the number of tokens.
    dropped_lookup = set(dropped)

    kept_attrs = np.delete(doc.to_array(attrs), dropped, axis=0)
    kept_words = [token.text for index, token in enumerate(doc)
                  if index not in dropped_lookup]

    doc2 = Doc(doc.vocab, words=kept_words)
    doc2.from_array(attrs, kept_attrs)
    return doc2

def combine_text(headline, body):
    """Merge headline and body into one cleaned text.

    The merged text is passed through ``remove_punct_and_numbers`` and the
    remaining tokens are joined again including their trailing whitespace.

    A headline that ends in an ellipsis and a body that starts with one are
    treated as one sentence split in two: the dots ("..." or "..") are removed
    at the seam. Exactly one space separates the two parts when at least one
    of them already brings a space or neither does.

    Args:
        headline: Headline text or None.
        body: Body text or None.

    Returns:
        The cleaned text; "" when both inputs are None.
    """
    def _cleaned(text):
        return "".join(token.text_with_ws for token in remove_punct_and_numbers(text))

    if headline is None and body is None:
        return ""
    if headline is None:
        return _cleaned(body)
    if body is None:
        return _cleaned(headline)

    # remove the three dots at the seam ...
    if headline.endswith("..."):
        headline = headline[:-3]
    # ... or, as sometimes written, only two dots (two characters, not three)
    elif headline.endswith(".."):
        headline = headline[:-2]
    if body.startswith("..."):
        body = body[3:]
    elif body.startswith(".."):
        body = body[2:]

    # add missing space, to avoid combining words
    if not headline.endswith(" "):
        if not body.startswith(" "):
            headline += " "
    elif body.startswith(" "):
        # both sides bring a space: keep only one
        body = body[1:]

    return _cleaned(headline + body)

# Open the corpus and load the German spaCy pipeline that the text helpers use.
con = sqlite3.connect('corpus.sqlite3')
nlp = spacy.load("de_core_news_lg")

# Replace the Headline/Body pair by a single cleaned "combined_text" column
# placed at position 2.
Posts_Annotated_df = pd.read_sql_query("SELECT * FROM Posts_Annotated", con)
_merged_text = Posts_Annotated_df.apply(lambda x: combine_text(x["Headline"], x["Body"]), axis=1)
Posts_Annotated_df.insert(2, "combined_text", _merged_text)
Posts_Annotated_df = Posts_Annotated_df.drop(columns=['Headline', 'Body'])

# Store the result as a new table; if_exists='fail' refuses to overwrite an
# existing Posts_Annotated_combined table.
Posts_Annotated_df.to_sql("Posts_Annotated_combined", con, if_exists='fail')

   ID_Post  Status                                      combined_text  \
0     3326  online       Top qualifizierte Leute verdienen auch viel    
1     5321  online  Gott sei dank ist für sie eine Umfrage alles a...   
2     5590  online  Die FPÖ wird aus allen Rohren schießen und die...   
3     6015  online  Weil es dein meisten Leuten verständlicherweis...   
4     8213  online                Na wer weis was da vorgefallen ist    

   PositiveVotes  NegativeVotes  ArgumentsUsed  Discriminating  Inappropriate  \
0              1              3              0               0              0   
1              2              1              1               0              0   
2              7              1              1               0              0   
3              2              0              0               0              1   
4              0              0              0               0              0   

   OffTopic  PersonalStories  PossiblyFeedback  SentimentNegative  \
0    

In [2]:
##############################################################
################# Create Dataframe for Test ##################
##############################################################
import sqlite3
import pandas as pd
import numpy as np
import spacy

# Load the cleaned posts and the German spaCy pipeline.
con = sqlite3.connect('corpus.sqlite3')
Posts_Annotated_df = pd.read_sql_query("SELECT * FROM Posts_Annotated_combined", con)
nlp = spacy.load("de_core_news_lg")

# Throwaway document, only used to read off the width of a document vector.
shape = nlp("test")

# One document vector per post, stored row by row in the same order as the frame.
combined_text = np.empty((len(Posts_Annotated_df["combined_text"]), shape.vector.shape[0]))
for pos, text in enumerate(Posts_Annotated_df["combined_text"]):
    combined_text[pos] = nlp(text).vector

'\n# ArgumentsUsed\nArgumentsUsed = np.empty((len(Posts_Annotated_df["ArgumentsUsed"]), shape.vector.shape[0]))\npos = 0\nfor text in Posts_Annotated_df["ArgumentsUsed"]:\n    combined_text[pos] = nlp(text).vector\n    pos += 1\n    \n# Discriminating\nDiscriminating = np.empty((len(Posts_Annotated_df["Discriminating"]), shape.vector.shape[0]))\npos = 0\nfor text in Posts_Annotated_df["Discriminating"]:\n    combined_text[pos] = nlp(text).vector\n    pos += 1\n\n# Inappropriate\nInappropriate = np.empty((len(Posts_Annotated_df["Inappropriate"]), shape.vector.shape[0]))\npos = 0\nfor text in Posts_Annotated_df["Inappropriate"]:\n    combined_text[pos] = nlp(text).vector\n    pos += 1\n\n# OffTopic\nOffTopic = np.empty((len(Posts_Annotated_df["OffTopic"]), shape.vector.shape[0]))\npos = 0\nfor text in Posts_Annotated_df["OffTopic"]:\n    combined_text[pos] = nlp(text).vector\n    pos += 1\n\n# PersonalStories\nPersonalStories = np.empty((len(Posts_Annotated_df["PersonalStories"]), shape.

In [15]:
##############################################################
#################### Test Classification #####################
##############################################################
import pandas as pd
import numpy as np
import spacy
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
        
def trainModel(text_vector, label, act="relu", solv="adam", l_rate="constant", l_rate_init=0.001, mom=0.9, iter_change=10):
    """Train an MLPClassifier on a 70/30 split and print its test metrics.

    Args:
        text_vector: Feature matrix, one row per sample.
        label: Labels for the rows; the held-out part must offer ``to_numpy()``.
        act: Passed to MLPClassifier as ``activation``.
        solv: Passed as ``solver``.
        l_rate: Passed as ``learning_rate``.
        l_rate_init: Passed as ``learning_rate_init``.
        mom: Passed as ``momentum``.
        iter_change: Passed as ``n_iter_no_change``.

    Returns:
        None. Accuracy, recall, precision and F1 (each times 100) and the
        confusion matrix are printed.
    """
    # Fixed seed, so every hyper-parameter setting sees the same split.
    features_fit, features_eval, labels_fit, labels_eval = train_test_split(
        text_vector, label, test_size=0.30, random_state=101, shuffle=True)

    # Model Training (MLPClassifier)
    network = MLPClassifier(
        random_state=1,
        max_iter=5000,
        activation=act,
        solver=solv,
        learning_rate=l_rate,
        learning_rate_init=l_rate_init,
        momentum=mom,
        n_iter_no_change=iter_change,
    )
    network.fit(features_fit, labels_fit)

    # Testing
    predicted = network.predict(features_eval)
    expected = labels_eval.to_numpy()

    # Evaluation; recall, precision and F1 score the class labelled 1.
    positive_class = {"average": "binary", "pos_label": 1}
    print("Accuracy:", metrics.accuracy_score(expected, predicted) * 100)
    print("Recall:", metrics.recall_score(expected, predicted, **positive_class) * 100)
    print("Precision:", metrics.precision_score(expected, predicted, **positive_class) * 100)
    print("F1-Score:", metrics.f1_score(expected, predicted, **positive_class) * 100)
    print("Confusion Matrix:", metrics.confusion_matrix(expected, predicted))

#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "identity", "lbfgs")
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "identity", "sgd")
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "identity", "adam")
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "logistic", "lbfgs")
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "logistic", "sgd")
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "logistic", "adam")
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "tanh", "lbfgs")
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "tanh", "sgd")
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "tanh", "adam")
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "lbfgs")
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "constant", 0.001) #76,667
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "constant", 0.002) #76,389
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "constant", 0.003) #77,685
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "constant", 0.01) #77,314
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "constant", 0.02) #76,203
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "constant", 0.03) #76,667
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "constant", 0.1) #76,296
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "constant", 0.2) #73,703
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "constant", 0.3) #73,703
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "invscaling", 0.001) #65,462
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "invscaling", 0.002) #68,056
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "invscaling", 0.003) #70,277
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "invscaling", 0.01) #75,556
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "invscaling", 0.02) #76,203
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "invscaling", 0.03) #76,389
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "invscaling", 0.1) #77,592
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "invscaling", 0.2) #76,389
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "invscaling", 0.3) #74,444
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.001) #76,667
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.002) #76,574
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.003) #77,5
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.01) #77,685
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.02) #76,296
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.03) #76,111
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.04) #76,203
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.05) #77,96
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.06) #77,96
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.06, 0.1) #76,389
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.06, 0.2) #76,481
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.06, 0.3) #76,481
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.06, 0.4) #77,870
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.06, 0.5) #76,389
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.06, 0.6) #75,833
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.06, 0.7) #76,944
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.06, 0.8) #76,759
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.06, 0.9) #77,962
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.06, 0.9, 11) #77,962
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.06, 0.9, 15) #78,240
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.06, 0.9, 20) #78,240 <=== best
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.06, 0.9, 50) #78,240
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.06, 0.9, 100) #78,240
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.07) #77,129
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.08) #76,296
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.09) #77,037
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.1) #77,777
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.11) #75,277
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.2) #73,703
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.3) #71,389
#trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "adam")

# Evaluate the configuration marked "best" above on every annotation category.
for _category in ("ArgumentsUsed",
                  "Discriminating",
                  "Inappropriate",
                  "OffTopic",
                  "PersonalStories",
                  "PossiblyFeedback",
                  "SentimentNegative",
                  "SentimentNeutral",
                  "SentimentPositive"):
    trainModel(combined_text, Posts_Annotated_df[_category], "relu", "sgd", "adaptive", 0.06, 0.9, 20)

Accuracy: 59.53703703703703
Recall: 63.01115241635687
Precision: 58.752166377816295
F1-Score: 60.80717488789238
Confusion Matrix: [[304 238]
 [199 339]]
Accuracy: 98.24074074074073
Recall: 0.0
Precision: 0.0
F1-Score: 0.0
Confusion Matrix: [[1061    5]
 [  14    0]]
Accuracy: 60.18518518518518
Recall: 65.79925650557621
Precision: 59.0
F1-Score: 62.214411247803156
Confusion Matrix: [[296 246]
 [184 354]]
Accuracy: 98.42592592592592
Recall: 0.0
Precision: 0.0
F1-Score: 0.0
Confusion Matrix: [[1063    3]
 [  14    0]]


In [10]:
##############################################################
###################### Classify Posts ########################
##############################################################
import sqlite3
import pandas as pd
import numpy as np
import spacy
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
from spacy.tokens import Doc

def trainModel(text_vector, label, act="relu", solv="adam", l_rate="constant", l_rate_init=0.001, mom=0.9, iter_change=10):
    """Fit an MLPClassifier on all given samples and return it.

    Args:
        text_vector: Feature matrix, one row per sample.
        label: Labels for the rows.
        act: Passed to MLPClassifier as ``activation``.
        solv: Passed as ``solver``.
        l_rate: Passed as ``learning_rate``.
        l_rate_init: Passed as ``learning_rate_init``.
        mom: Passed as ``momentum``.
        iter_change: Passed as ``n_iter_no_change``.

    Returns:
        The fitted classifier.
    """
    # Model Training (MLPClassifier); no hold-out split, everything is used for fitting.
    settings = {
        "random_state": 1,
        "max_iter": 5000,
        "activation": act,
        "solver": solv,
        "learning_rate": l_rate,
        "learning_rate_init": l_rate_init,
        "momentum": mom,
        "n_iter_no_change": iter_change,
    }
    network = MLPClassifier(**settings)
    network.fit(text_vector, label)
    return network

def remove_punct_and_numbers(corpus):
    """Parse ``corpus`` and return a new Doc without punctuation, numbers and symbols.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    whose coarse part-of-speech tag is ``PUNCT``, ``NUM`` or ``SYM`` are left
    out, and the LOWER, POS, ENT_TYPE and IS_ALPHA attributes of the remaining
    tokens are copied onto the new document.

    Args:
        corpus: Text to parse.

    Returns:
        A ``Doc`` that shares the vocabulary of the parsed text and contains
        only the kept tokens.
    """
    # code from https://stackoverflow.com/questions/45375488/how-to-filter-tokens-from-spacy-document
    doc = nlp(corpus)
    attrs = [LOWER, POS, ENT_TYPE, IS_ALPHA]

    dropped = [index for index, token in enumerate(doc)
               if token.pos_ in ('PUNCT', 'NUM', 'SYM')]
    # Set lookup keeps the word filter below linear in the number of tokens.
    dropped_lookup = set(dropped)

    kept_attrs = np.delete(doc.to_array(attrs), dropped, axis=0)
    kept_words = [token.text for index, token in enumerate(doc)
                  if index not in dropped_lookup]

    doc2 = Doc(doc.vocab, words=kept_words)
    doc2.from_array(attrs, kept_attrs)
    return doc2

def combine_text(headline, body):
    """Merge headline and body into one raw text.

    A headline that ends in an ellipsis and a body that starts with one are
    treated as one sentence split in two: the dots ("..." or "..") are removed
    at the seam. Exactly one space separates the two parts when at least one
    of them already brings a space or neither does.

    Args:
        headline: Headline text or None.
        body: Body text or None.

    Returns:
        The merged text. If one part is None the other is returned unchanged;
        "" when both are None.
    """
    if headline is None:
        return "" if body is None else body
    if body is None:
        return headline

    # remove the three dots at the seam ...
    if headline.endswith("..."):
        headline = headline[:-3]
    # ... or, as sometimes written, only two dots (two characters, not three)
    elif headline.endswith(".."):
        headline = headline[:-2]
    if body.startswith("..."):
        body = body[3:]
    elif body.startswith(".."):
        body = body[2:]

    # add missing space, to avoid combining words
    if not headline.endswith(" "):
        if not body.startswith(" "):
            headline += " "
    elif body.startswith(" "):
        # both sides bring a space: keep only one
        body = body[1:]

    return headline + body

con = sqlite3.connect('corpus.sqlite3')
# Train on the hand-annotated posts only. On a first run this is the whole
# table; after an interrupted run it keeps already predicted rows out of the
# training data.
Posts_Annotated_df = pd.read_sql_query("SELECT * FROM Posts_Annotated_combined WHERE ID_Post IN (SELECT ID_Post FROM Posts_Annotated)", con)
nlp = spacy.load("de_core_news_lg")
# Throwaway document, only used to read off the width of a document vector.
shape = nlp("test")

# create vector representation for all inputs
# Posts
print("create Training data")
combined_text = np.empty((len(Posts_Annotated_df["combined_text"]), shape.vector.shape[0]))
for pos, text in enumerate(Posts_Annotated_df["combined_text"]):
    combined_text[pos] = nlp(text).vector

# create one classifier per annotation category
print("create Classifier")
ArgumentsUsed = trainModel(combined_text, Posts_Annotated_df["ArgumentsUsed"], "relu", "sgd", "adaptive", 0.06, 0.9, 20)
Discriminating = trainModel(combined_text, Posts_Annotated_df["Discriminating"], "relu", "sgd", "adaptive", 0.06, 0.9, 20)
Inappropriate = trainModel(combined_text, Posts_Annotated_df["Inappropriate"], "relu", "sgd", "adaptive", 0.06, 0.9, 20)
OffTopic = trainModel(combined_text, Posts_Annotated_df["OffTopic"], "relu", "sgd", "adaptive", 0.06, 0.9, 20)
PersonalStories = trainModel(combined_text, Posts_Annotated_df["PersonalStories"], "relu", "sgd", "adaptive", 0.06, 0.9, 20)
PossiblyFeedback = trainModel(combined_text, Posts_Annotated_df["PossiblyFeedback"], "relu", "sgd", "adaptive", 0.06, 0.9, 20)
SentimentNegative = trainModel(combined_text, Posts_Annotated_df["SentimentNegative"], "relu", "sgd", "adaptive", 0.06, 0.9, 20)
SentimentNeutral = trainModel(combined_text, Posts_Annotated_df["SentimentNeutral"], "relu", "sgd", "adaptive", 0.06, 0.9, 20)
SentimentPositive = trainModel(combined_text, Posts_Annotated_df["SentimentPositive"], "relu", "sgd", "adaptive", 0.06, 0.9, 20)

# Same order as the label columns of the INSERT statement below.
classifiers = [ArgumentsUsed, Discriminating, Inappropriate, OffTopic, PersonalStories,
               PossiblyFeedback, SentimentNegative, SentimentNeutral, SentimentPositive]

# load posts and combine text
# Only posts that are not yet in the target table are loaded, so a re-run
# resumes after the last commit instead of inserting duplicates.
print("Load Posts")
Posts_df = pd.read_sql_query("SELECT ID_Post, Status, Headline, Body, PositiveVotes, NegativeVotes FROM Posts WHERE ID_Post NOT IN (SELECT ID_Post FROM Posts_Annotated_combined)", con)
# A plain list also works when no post is left to process.
Posts_df.insert(2, "combined_text", [combine_text(headline, body) for headline, body in zip(Posts_df["Headline"], Posts_df["Body"])])
Posts_df.drop(columns=['Headline', 'Body'], inplace=True)

# insert statement
insert_table = "INSERT INTO Posts_Annotated_combined ( ID_Post, Status, combined_text, PositiveVotes, NegativeVotes, ArgumentsUsed, Discriminating, Inappropriate, OffTopic, PersonalStories, PossiblyFeedback, SentimentNegative, SentimentNeutral, SentimentPositive ) values ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )"

# iterate over posts
print("start processing posts")
cur = con.cursor()
try:
    for index, row in Posts_df.iterrows():
        if (index % 10000) == 0:
            # Save (commit) the changes so far; committing is what persists
            # progress, the connection can stay open.
            con.commit()
            print(index/10000)

        # text to vector
        doc = remove_punct_and_numbers(row["combined_text"])
        text_vec = doc.vector
        text_cleaned = "".join(token.text_with_ws for token in doc)

        # use classifier
        # predict() returns numpy scalars; cast to a plain int so SQLite
        # stores an INTEGER value.
        predicted_labels = [int(classifier.predict([text_vec])[0]) for classifier in classifiers]

        cur.execute(insert_table, [row['ID_Post'], row['Status'], text_cleaned, row['PositiveVotes'], row['NegativeVotes']] + predicted_labels)

    # Save (commit) the changes
    con.commit()
finally:
    # close the database connection, also when a post fails
    con.close()

create Training data
create Classifier
Load Posts
start processing posts
0.0
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
20.0
21.0
22.0
23.0
24.0
25.0
26.0
27.0
28.0
29.0
30.0
31.0
32.0
33.0
34.0
35.0
36.0
37.0
38.0
39.0
40.0
41.0
42.0
43.0
44.0
45.0
46.0
47.0
48.0
49.0
50.0
51.0
52.0
53.0
54.0
55.0
56.0
57.0
58.0
59.0
60.0
61.0
62.0
63.0
64.0
65.0
66.0
67.0
68.0
69.0
70.0
71.0
72.0
73.0
74.0
75.0
76.0
77.0
78.0
79.0
80.0
81.0
82.0
83.0
84.0
85.0
86.0
87.0
88.0
89.0
90.0
91.0
92.0
93.0
94.0
95.0
96.0
97.0
98.0
99.0
100.0
