In [2]:
import pandas as pd
import scipy
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import re
import numpy as np
from sklearn.metrics import accuracy_score

In [3]:
path_text = "../datasets/stance/{}/train_text.txt"
path_labels = "../datasets/stance/{}/train_labels.txt"

text = pd.read_csv(path_text.format("hillary"), names=["text"], delimiter="\n")
labels = pd.read_csv(path_labels.format("hillary"), names=["labels"], delimiter="\n")
hillary = pd.concat([text, labels], axis=1)

In [4]:
text = pd.read_csv(path_text.format("abortion"), names=["text"], delimiter="\n")
labels = pd.read_csv(path_labels.format("abortion"), names=["labels"], delimiter="\n")
abortion = pd.concat([text, labels], axis=1)
abortion["labels"] +=3

In [5]:
text = pd.read_csv(path_text.format("atheism"), names=["text"], delimiter="\n")
labels = pd.read_csv(path_labels.format("atheism"), names=["labels"], delimiter="\n")
atheism = pd.concat([text, labels], axis=1)
atheism["labels"] +=6

In [6]:
text = pd.read_csv(path_text.format("climate"), names=["text"], delimiter="\n")
labels = pd.read_csv(path_labels.format("climate"), names=["labels"], delimiter="\n")
climate = pd.concat([text, labels], axis=1)
climate["labels"] +=9

In [7]:
text = pd.read_csv(path_text.format("feminist"), names=["text"], delimiter="\n")
labels = pd.read_csv(path_labels.format("feminist"), names=["labels"], delimiter="\n")
feminist = pd.concat([text, labels], axis=1)
feminist["labels"] +=12

In [8]:
def tokenize_meaning(line):
    tokens = []
    unmatchables = []
    
    for word in line.split():
        if re.findall(r"\w+-\w+|\w+'\w+|\w+|[&?!…]+", word) != []:
            x = re.findall(r"\w+-\w+|\w+'\w+|\w+|[&?!…]+", word)
            for element in x:
                if element == "âž" or element == "ðŸ":
                    continue
                elif element == "&":
                    tokens.append("and")
                else:
                    tokens.append(element.lower())

        if re.findall(r"\w+-\w+|\w+'\w+|\w+|[&?!…]", word) != [word] and re.findall(r"[^\w|&!?…]+", word) != []:
            unmatchables.append(re.findall(r"[^\w|!?…&]+", word)[0])

    s = ""
    for token in tokens:
        s = s + " " + token
    return (s, unmatchables, tokens)

In [9]:
val_text = "../datasets/stance/{}/val_text.txt"
val_labels = "../datasets/stance/{}/val_labels.txt"

text = pd.read_csv(val_text.format("hillary"), names=["text"], delimiter="\n")
labels = pd.read_csv(val_labels.format("hillary"), names=["labels"], delimiter="\n")
hillary_val = pd.concat([text, labels], axis=1)

In [11]:
text = pd.read_csv(val_text.format("abortion"), names=["text"], delimiter="\n")
labels = pd.read_csv(val_labels.format("abortion"), names=["labels"], delimiter="\n")
abortion_val = pd.concat([text, labels], axis=1)
abortion_val["labels"] +=3

In [12]:
text = pd.read_csv(val_text.format("atheism"), names=["text"], delimiter="\n")
labels = pd.read_csv(val_labels.format("atheism"), names=["labels"], delimiter="\n")
atheism_val = pd.concat([text, labels], axis=1)
atheism_val["labels"] +=6

In [13]:
text = pd.read_csv(val_text.format("climate"), names=["text"], delimiter="\n")
labels = pd.read_csv(val_labels.format("climate"), names=["labels"], delimiter="\n")
climate_val = pd.concat([text, labels], axis=1)
climate_val["labels"] +=9

In [14]:
text = pd.read_csv(val_text.format("feminist"), names=["text"], delimiter="\n")
labels = pd.read_csv(val_labels.format("feminist"), names=["labels"], delimiter="\n")
feminist_val = pd.concat([text, labels], axis=1)
feminist_val["labels"] +=12

In [16]:
combined_train

Unnamed: 0,text,labels
0,If a man demanded staff to get him an ice tea ...,1
1,"We're out here in G-town, and where are you #...",0
2,If you're not watching @user speech right now ...,2
3,How can she live with herself? #Benghazi #SemST,1
4,Jimmy Fallon music playing. Thank you .... #Do...,0
...,...,...
592,The only thing ugly is the god damn opinions o...,12
593,The lack intellectual integrity in a group of ...,13
594,What a real #Patriarchy and #RapeCulture looks...,13
595,"Women are shamed if they don't want children, ...",14


In [15]:
combined_train = pd.concat([hillary, abortion, atheism, climate, feminist])
combined_val = pd.concat([hillary_val, abortion_val, atheism_val, climate_val, feminist_val])

In [129]:
stance_train_vocab = []
stance_train_list = []
stance_val_vocab = []

for i in combined_train["text"]:
    x = tokenize_meaning(i)
    stance_train_vocab.append(x[0])
    stance_train_list.append(x[2])

for i in combined_val["text"]:
    x = tokenize_meaning(i)
    stance_val_vocab.append(x[0])

In [91]:
mapping = {}
mapping[0]  = "none hillary"
mapping[1]  = "against hillary"
mapping[2]  = "favor hillary"
mapping[3]  = "none abortion"
mapping[4]  = "against abortion"
mapping[5]  = "favor abortion"
mapping[6]  = "none atheism"
mapping[7]  = "against atheism"
mapping[8]  = "favor atheism"
mapping[9]  = "none climate"
mapping[10] = "against climate"
mapping[11] = "favor climate"
mapping[12] = "none feminist"
mapping[13] = "against feminist"
mapping[14] = "favor feminist"

In [135]:
stance_forest = Pipeline([('vect', CountVectorizer(analyzer='word', ngram_range=(1, 1))), ('tfidf', TfidfTransformer()), ('clf', RandomForestClassifier())])
stance_forest.fit(stance_train_vocab, combined_train["labels"]);

In [142]:
predicted = stance_forest.predict(combined_val["text"])
accuracy_score(predicted, combined_val["labels"])

0.47278911564625853

In [159]:
combined_val["labels"].value_counts()

1     39
4     36
13    33
7     31
11    21
14    21
0     18
3     18
9     17
12    13
2     12
5     12
6     12
8      9
10     2
Name: labels, dtype: int64

In [153]:
predicted = stance_forest.predict(hillary_val["text"])
accuracy_score(predicted, hillar_val["labels"])

0.4492753623188406

In [154]:
predicted = stance_forest.predict(abortion_val["text"])
accuracy_score(predicted, abortion_val["labels"])

0.4696969696969697

In [155]:
predicted = stance_forest.predict(atheism_val["text"])
accuracy_score(predicted, atheism_val["labels"])

0.5192307692307693

In [156]:
predicted = stance_forest.predict(climate_val["text"])
accuracy_score(predicted, climate_val["labels"])

0.5

In [157]:
predicted = stance_forest.predict(feminist_val["text"])
accuracy_score(predicted, feminist_val["labels"])

0.44776119402985076

In [149]:
hillary_train_vocab = []
hillary_train_list = []
hillary_val_vocab = []

for i in hillary["text"]:
    x = tokenize_meaning(i)
    hillary_train_vocab.append(x[0])
    hillary_train_list.append(x[2])

for i in hillary_val["text"]:
    x = tokenize_meaning(i)
    hillary_val_vocab.append(x[0])

In [151]:
hillary_forest = Pipeline([('vect', CountVectorizer(analyzer='word', ngram_range=(1, 1))), ('tfidf', TfidfTransformer()), ('clf', RandomForestClassifier())])
hillary_forest.fit(hillary_train_vocab, hillary["labels"]);

In [152]:
predicted = hillary_forest.predict(hillary_val["text"])
accuracy_score(predicted, hillary_val["labels"])

0.5797101449275363