# Sentiment Analysis: to do
- Embeddings
- Lexicons
- Run demo
- Train classifiers
- Outcomes

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import seaborn
import re
import statsmodels.formula.api
import re

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Configure how graphs will show up in this notebook
%matplotlib inline
seaborn.set_context('notebook', rc={'figure.figsize': (10, 6)}, font_scale=1.5)

In [3]:
# Import word embeddings
def load_embeddings(filename):
    """Load word embeddings from a text file into a DataFrame.

    Every data line holds a label followed by space-separated float
    components. A line with exactly two fields is treated as a header giving
    the shape of the matrix and is skipped. Blank lines are skipped as well,
    so a stray empty line no longer breaks the final stacking step.

    Parameters
    ----------
    filename : str or path-like
        Path of the embeddings file; it is decoded as UTF-8.

    Returns
    -------
    pandas.DataFrame
        One float32 row per label, indexed by label. The frame is empty
        (shape ``(0, 0)``) when the file contains no vectors.

    Raises
    ------
    ValueError
        If a component cannot be parsed as a float, or if the rows do not
        all have the same number of components.
    """
    labels = []
    rows = []
    with open(filename, encoding='utf-8') as infile:
        for line in infile:
            line = line.rstrip()
            if not line:
                # A blank line carries no vector; keeping it would append a
                # zero-length row and make np.vstack fail.
                continue
            items = line.split(' ')
            if len(items) == 2:
                # This is a header row giving the shape of the matrix
                continue
            labels.append(items[0])
            rows.append(np.array([float(x) for x in items[1:]], 'f'))

    if not rows:
        # np.vstack rejects an empty sequence, so build the empty frame directly.
        return pd.DataFrame(np.empty((0, 0), dtype='f'), index=labels)

    arr = np.vstack(rows)
    return pd.DataFrame(arr, index=labels, dtype='f')


# Load the GloVe vectors and show the dimensions of the resulting matrix
embeddings = load_embeddings(filename='glove.42B.300d.txt')
embeddings.shape

(1917494, 300)

In [4]:
# Load "state-of-the-art" Lexicon: https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html#lexicon
def load_lexicon(filename):
    """Read a word list with one entry per line.

    The file is decoded as Latin-1. Trailing whitespace is stripped from
    every line; lines that are empty after stripping and lines that start
    with ';' are left out.

    Returns the remaining entries as a list, in file order.
    """
    with open(filename, encoding='latin-1') as infile:
        stripped = (raw.rstrip() for raw in infile)
        return [entry for entry in stripped
                if entry and not entry.startswith(';')]


# Read the positive and the negative word list from disk
pos_words, neg_words = (
    load_lexicon(path)
    for path in ('positive-words.txt', 'negative-words.txt')
)

In [5]:
# Deal with mismatches between the embedding and the lexicon: `reindex` yields
# an all-NaN row for every lexicon word that has no embedding, and `dropna`
# then discards those rows.  (`.loc` with a list that contains missing labels
# raises KeyError on pandas >= 1.0, so it cannot be used for this lookup.)
# NOTE(review): `reindex` needs the labels in `embeddings.index` to be unique
# -- confirm for the embeddings file in use.
pos_vectors = embeddings.reindex(pos_words).dropna()
neg_vectors = embeddings.reindex(neg_words).dropna()

# Match sentiment to polarity: +1 for every positive word, -1 for every negative word
print("Polarising...")
vectors = pd.concat([pos_vectors, neg_vectors])
targets = np.array([1] * len(pos_vectors) + [-1] * len(neg_vectors))
labels = list(pos_vectors.index) + list(neg_vectors.index)

print("Splitting...")
# Hold out 10% of the words for testing; the fixed random_state keeps the
# split the same from run to run.
train_vectors, test_vectors, train_targets, test_targets, train_labels, test_labels = \
    train_test_split(vectors, targets, labels, test_size=0.1, random_state=0)

Polarising...
Splitting...


Now we make our classifier and fit it to the training vectors. We use logistic regression with the L-BFGS solver, which runs for up to 100 iterations (scikit-learn's default limit). Because the model is trained with a logistic loss, the resulting classifier can output the probability that a word is positive or negative.

In [6]:
# Fit a logistic-regression classifier on the training split
from sklearn.linear_model import LogisticRegression

lgClassifier = LogisticRegression(solver='lbfgs').fit(train_vectors, train_targets)

# Accuracy on the held-out words: predictions compared with the known targets
test_predictions = lgClassifier.predict(test_vectors)
accuracy_score(test_predictions, test_targets)

0.947209653092006

In [7]:
# Classification time!
# Second model: a multinomial naive Bayes classifier fitted on the same
# training split as the logistic regression in the previous cell.
# NOTE(review): this rebinds `lgClassifier`, so every later use of
# `lgClassifier` scores with the naive Bayes model rather than the logistic
# regression -- confirm that this is intended.
# NOTE(review): MultinomialNB is documented for non-negative (count-like)
# features; confirm the embedding values satisfy that, otherwise `fit` may
# reject the data.
from sklearn.naive_bayes import MultinomialNB
lgClassifier = MultinomialNB() 
lgClassifier.fit(train_vectors, train_targets)

# Score for out-of-training data
accuracy_score(lgClassifier.predict(test_vectors), test_targets)

0.9502262443438914

In [8]:
# predict_log_proba returns one column of log probabilities per class
def vecs_to_sentiment(vecs):
    """Collapse the classifier's per-class log probabilities into one score.

    For every row of `vecs` the result is the log probability in column 1
    (positive sentiment) minus the log probability in column 0 (negative
    sentiment), as returned by `lgClassifier.predict_log_proba`.
    """
    log_probs = lgClassifier.predict_log_proba(vecs)
    negative, positive = log_probs[:, 0], log_probs[:, 1]
    return positive - negative


# log-prob for specific words or sentences
def words_to_sentiment(words):
    """Score each word that has an embedding.

    Parameters
    ----------
    words : list of str
        Words to look up in the module-level `embeddings` frame.

    Returns
    -------
    pandas.DataFrame
        A single 'sentiment' column holding the value computed by
        `vecs_to_sentiment`, indexed by the words that were found. Words
        without an embedding are left out.
    """
    # `reindex` gives an all-NaN row for an unknown word, which `dropna` then
    # removes.  `.loc` with a list containing missing labels raises KeyError
    # on pandas >= 1.0, which would make any text with one unknown word fail.
    # NOTE(review): `reindex` needs unique labels in `embeddings.index` -- confirm.
    vecs = embeddings.reindex(words).dropna()
    log_odds = vecs_to_sentiment(vecs)
    return pd.DataFrame({'sentiment': log_odds}, index=vecs.index)


# The regex below finds tokens that start with a word-like character (\w), and continues
# matching characters lazily (.*?) until the next word break (\b). It's a relatively simple
# expression that manages to extract something very much like words from text.
TOKEN_RE = re.compile(r"\w.*?\b")

# Combine sentiments for word vectors into an overall sentiment score by averaging them.
def text_to_sentiment(text):
    """Return the mean sentiment of the tokens found in `text`.

    Tokens are extracted with `TOKEN_RE`, case-folded, and scored with
    `words_to_sentiment`; the result is the mean of its 'sentiment' column.
    """
    folded = list(map(str.casefold, TOKEN_RE.findall(text)))
    return words_to_sentiment(folded)['sentiment'].mean()

## Other approaches

This is of course only one way to do sentiment analysis. All the steps we used are common, but you probably object that you wouldn't do it that way. But if you have your own process, I urge you to see if your process is encoding prejudices and biases in the model it learns.

Instead of or in addition to changing your source of word vectors, you could try to fix this problem in the output directly. It may help, for example, to build a stronger model of whether sentiment should be assigned to words at all, designed to specifically exclude names and groups of people.

You could abandon the idea of inferring sentiment for words, and only count the sentiment of words that appear exactly in the list. This is perhaps the most common form of sentiment analysis -- the kind that includes no machine learning at all. Its results will be no more biased than whoever made the list. But the lack of machine learning means that this approach has low recall, and the only way to adapt it to your data set is to edit the list manually.

As a hybrid approach, you could produce a large number of inferred sentiments for words, and have a human annotator patiently look through them, making a list of exceptions whose sentiment should be set to 0. The downside of this is that it's extra work; the upside is that you take the time to actually see what your data is doing. And that's something that I think should happen more often in machine learning anyway.


In [9]:
# Demo: score a single short sentence
demo_text = "My name is Ahmed"
text_to_sentiment(demo_text)

1.6016286987257962

---------

In [10]:
# Importing data
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


def _read_lines(path):
    """Return the lines of `path`, each with trailing newline characters removed."""
    with open(path) as f:
        return [line.rstrip('\n') for line in f]


full_data = _read_lines('FINAL-OUTPUT.txt')
man_data = _read_lines('MEN-OUTPUT.txt')
woman_data = _read_lines('WOMEN-OUTPUT.txt')

# Line counts per file; the last figure is the two per-gender files combined
for count in (len(full_data), len(man_data), len(woman_data),
              len(woman_data) + len(man_data)):
    print(count)

1572904
809290
763614
1572904


In [11]:
# Organise data: parse each JSON line into three parallel lists
import json

gender = []
body = []
subreddit = []
for line in full_data:
    record = json.loads(line)
    body.append(record['body'])
    # 1 when the author flair class is 'male', 0 for anything else
    gender.append(1 if record['author_flair_css_class'] == 'male' else 0)
    # 'AM' for the AskMen subreddit, 'AW' for anything else
    subreddit.append('AM' if record['subreddit'] == 'AskMen' else 'AW')

In [12]:
# Score every comment body and collect the scores overall, per subreddit and
# per gender flag.
sentiment = []
sentimentOfAM = []
sentimentOfAW = []
sentimentOfMen = []
sentimentOfWomen = []

# Positions in `body` whose text could not be scored, kept for inspection.
failed_indices = []

for i, item in enumerate(body):
    try:
        score = text_to_sentiment(item)
    except Exception:
        # Scoring failed for this comment.  Keep `sentiment` aligned with
        # `body` by storing a placeholder 0, and leave the comment out of the
        # per-group lists.  Note that these placeholders are part of
        # `sentiment`, so they pull any mean taken over it towards 0.
        # Catching Exception (not a bare except) lets Ctrl-C interrupt the loop.
        sentiment.append(0)
        failed_indices.append(i)
        continue

    sentiment.append(score)

    if subreddit[i] == 'AM':
        sentimentOfAM.append(score)
    else:
        sentimentOfAW.append(score)

    if gender[i] == 1:
        sentimentOfMen.append(score)
    else:
        sentimentOfWomen.append(score)

In [13]:
# Size of the overall score list, then of each subgroup list
for scores in (sentiment, sentimentOfAM, sentimentOfAW,
               sentimentOfMen, sentimentOfWomen):
    print(len(scores))

1572904
808375
762763
862456
708682


In [None]:
# NOTE(review): this cell repeats the previous cell line for line and has no
# execution count (In [None]) -- it looks like a leftover; consider removing it.
print(len(sentiment))
print(len(sentimentOfAM))
print(len(sentimentOfAW))
print(len(sentimentOfMen))
print(len(sentimentOfWomen))

In [14]:
def _report_mean(label, values):
    """Print `label` followed by the arithmetic mean of `values`, and return the mean."""
    mean_value = sum(values) / len(values)
    print(label + ":", mean_value)
    return mean_value


# Mean sentiment overall, per subreddit and per gender flag
general = _report_mean("General", sentiment)

AskMenSent = _report_mean("AskMenSent", sentimentOfAM)

AskWomenSent = _report_mean("AskWomenSent", sentimentOfAW)

MenSent = _report_mean("MenSent", sentimentOfMen)

WomenSent = _report_mean("WomenSent", sentimentOfWomen)
    

General: 0.9796917245649305
AskMenSent: 0.9593854818054988
AskWomenSent: 1.0034804958820738
MenSent: 0.9566705501064069
WomenSent: 1.0101495118719404


---------