# NLP and Feature Learning
Examples from lecture 3, Risk Analytics Workshop at the University of Ljubljana, 2-3 November 2017

Resources:
* http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
* https://rare-technologies.com/word2vec-tutorial/
* https://www.tensorflow.org/tutorials/word2vec
* https://www.kaggle.com/c/word2vec-nlp-tutorial#part-2-word-vectors
* https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb or https://github.com/munichpavel/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb

In [3]:
#import tensorflow as tf
import os

import pandas as pd
import numpy as np

from glob import glob
from time import time
import re
from pprint import pprint

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

import gensim
from gensim.models.word2vec import Word2Vec

import nltk
from bs4 import BeautifulSoup as soup

import multiprocessing

#from itertools import combinations

#import cython
#import smart_open

#from nltk.tokenize import word_tokenize
#nltk.download()

%matplotlib inline

# Example: Ne Joci Peter

## Mini example
"A si ti" vs "Ali si ti"

In [2]:
save_me = False
njp_mini = ['a', 'ali', 'si']

# Label encoding
le = LabelEncoder()
vocab_labels = le.fit_transform(njp_mini)
print("Labels: {0}".format(vocab_labels))

# One hot encoding
enc = OneHotEncoder()
mini = pd.DataFrame(enc.fit_transform([[label] for label in vocab_labels]).toarray(), index=njp_mini)
#ohp = ohp.reindex(['a', 'ali', 'si', 'ti', 'tud', 'tudi', 'not', 'noter', 'padu', 'padel'])
if save_me:
    mini.to_csv("../slides/slide_data/mini.csv")

mini

Labels: [0 1 2]


Unnamed: 0,0,1,2
a,1.0,0.0,0.0
ali,0.0,1.0,0.0
si,0.0,0.0,1.0


## Less mini example
"A si ti tud not padu" vs "Ali si ti tudi noter padel"

In [3]:
njp_contexts = ["A si ti tud not padu", "Ali si tudi noter padel"]
# Split words by spaces
njp_splits = [c.split(" ") for c in njp_contexts]
# Flatten to get vocabulary
njp_vocab = list(set([word.lower() for context in njp_splits for word in context]))
print("NJP vocabulary: {0}".format(njp_vocab))

NJP vocabulary: ['ali', 'noter', 'tud', 'ti', 'padu', 'not', 'tudi', 'padel', 'si', 'a']


### Perform label and one hot encoding

In [4]:
# Label encoding
le = LabelEncoder()
vocab_labels = le.fit_transform(njp_vocab)
print("Labels: {0}".format(vocab_labels))

# One hot encoding
enc = OneHotEncoder()
ohp = pd.DataFrame(enc.fit_transform([[label] for label in vocab_labels]).toarray(), index=njp_vocab)
ohp = ohp.reindex(['a', 'ali', 'si', 'ti', 'tud', 'tudi', 'not', 'noter', 'padu', 'padel'])
if save_me:
    ohp.to_csv("../slides/slide_data/ohp.csv")
ohp

Labels: [1 3 8 7 5 2 9 4 6 0]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
a,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ali,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
si,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
ti,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
tud,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
tudi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
not,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
noter,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
padu,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
padel,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


### Geometry of one hot encoded NJP

In [5]:
print("Inner product <a, ali>: {0}".format(np.dot(ohp.loc['a', :], ohp.loc['ali', :])))

Inner product <a, ali>: 0.0


## Bag of words, NJP

In [6]:
vectorizer = CountVectorizer(vocabulary = njp_vocab)
# Note that the count vectorizer strips the question marks
njp = ['A si ti tud not padu?', 'Ali si ti tudi noter padel?']
X = vectorizer.fit_transform(njp)
bow_njp = pd.DataFrame(X.toarray(), columns = vectorizer.get_feature_names(), index=['peter', 'pravilno'])
bow_njp = bow_njp[['a', 'ali', 'si', 'ti', 'tud', 'tudi', 'not', 'noter', 'padu', 'padel']]
if save_me:
    bow_njp.to_csv('../slides/slide_data/bow_njp.csv')
bow_njp.head()

Unnamed: 0,a,ali,si,ti,tud,tudi,not,noter,padu,padel
peter,0,0,1,1,1,0,1,0,1,0
pravilno,0,1,1,1,0,1,0,1,0,1


## Problems
For background on tf-idf, see http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html or https://en.wikipedia.org/wiki/Tf–idf
* Label and one hot encode the vocabulary of "ce bi cebula ce ni imela, bi cebula bula bla"
* Calculate the BoW count vectors for the two sentences: ["ce bi cebula ce ni imela", "bi cebula bula bla"] using your encoding from the previous problem.
* Calculate the tf-idf vectors of the two sentences above. Verify two entries of your vectors by hand.

# IMDB Sentiment Analysis with Bag of Words

- 12.5k postive and 12.5k negative reviews in train set
- 12.5k postive and 12.5k negative reviews in test set
- Available at http://ai.stanford.edu/~amaas//data/sentiment/
- Citation: Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts, *Learning Word Vectors for Sentiment Analysis. The 49th Annual Meeting of the Association for Computational Linguistics* (2011)


In [7]:
# First download the IMDB data from http://ai.stanford.edu/~amaas//data/sentiment/
# and put it in the data directory of your repository.
# Alternatively, you could use the code from 
# https://github.com/munichpavel/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb
# but then much of the fun (i.e. text cleansing) is done for you

# Read in imdb review
imdb_dir = os.path.join("../data", "aclImdb")
train_dir = os.path.join(imdb_dir, "train")
test_dir = os.path.join(imdb_dir, "test")

train_pos_dir = os.path.join(train_dir, 'pos')
train_pos_files = glob(os.path.join(train_pos_dir, "*.txt"))

train_neg_dir = os.path.join(train_dir, 'neg')
train_neg_files = glob(os.path.join(train_neg_dir, "*.txt"))

train_unsup_dir = os.path.join(train_dir, 'unsup')

test_pos_dir = os.path.join(test_dir, 'pos')
test_pos_files = glob(os.path.join(test_pos_dir, "*.txt"))

test_neg_dir = os.path.join(test_dir, 'neg')
test_neg_files = glob(os.path.join(test_neg_dir, "*.txt"))

### Show one review

In [8]:
i=0
train_file = train_pos_files[i]
train_pos = []
with open(train_file) as f:
        for line in f:
            train_pos.append(line)
train_pos

['Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!']

## Initialize vectorizer to read from review files
Note that the token pattern argument given below is the default one, and could have been omitted. The default is

`token_pattern='(?u)\\b\\w\\w+\\b'`. 

You might want to understand what this means for one of the tutorial problems :)

Some resources:
* https://docs.python.org/3/howto/regex.html
* https://stackoverflow.com/questions/29689516/find-words-of-length-4-using-regular-expression

### Check on a small sample
Note that punctuation has been removed and all words are converted to lower case

In [9]:
# Change n_sample to more reviews
n_sample = 1
vectorizer = CountVectorizer(input='filename', decode_error='ignore', strip_accents='unicode',
                            token_pattern='(?u)\\b\\w\\w+\\b')
vectors = vectorizer.fit_transform(train_pos_files[:n_sample])
vocab_default = vectorizer.get_feature_names()
print("Number of words in vocabulary: {}".format(len(vocab_default)))
#vocab_default

Number of words in vocabulary: 87


We don't want numbers in the review, so we specify the token pattern to only get letters

In [10]:
token_pattern = '(?u)\\b[a-z][a-z]+\\b'
#token_pattern = '(?u)[A-Za-z]*\\b\\w\\w+'
vectorizer = CountVectorizer(input='filename', decode_error='ignore', strip_accents='unicode', 
                             token_pattern=token_pattern)
vectors = vectorizer.fit_transform(train_pos_files[:n_sample])
vocab_letters_only = vectorizer.get_feature_names()
print("Number of words in vocabulary: {}".format(len(vocab_letters_only)))
#vocab_letters_only
# See what words have been omitted
print("Words omitted: {0}".format(set(vocab_default).difference(set(vocab_letters_only))))

Number of words in vocabulary: 86
Words omitted: {'35'}


## Count vectorizer on all training reviews for BoW features

In [11]:
vectorizer = CountVectorizer(input='filename', decode_error='ignore', strip_accents='unicode',
                            token_pattern='(?u)\\b[a-z][a-z]+\\b')
bow_train = vectorizer.fit_transform(train_pos_files + train_neg_files)
vocab = vectorizer.get_feature_names()
# See word counts per review
#bow_train.toarray().sum(axis=1)

In [12]:
# Create target vectors: 1 for positive review, 0 for negative and train logistic regression classifier
y_train = np.concatenate([np.repeat(1, len(train_pos_files)), np.repeat(0, len(train_neg_files))])
clf = LogisticRegression().fit(bow_train, y_train)

## Test the classifier

In [13]:
bow_test = vectorizer.transform(test_pos_files + test_neg_files)
y_test = np.concatenate([np.repeat(1, len(test_pos_files)), np.repeat(0, len(test_neg_files))])
y_pred = clf.predict(bow_test)
y_proba = clf.predict_proba(bow_test)

In [14]:
# Look at some of these reviews
test_pos = []
for file in test_pos_files:

    with open(file) as f:
        for line in f:
            test_pos.append(line)
            
test_neg = []
for file in test_neg_files:

    with open(file) as f:
        for line in f:
            test_neg.append(line)
            

### Look at a few reviews to see how BoW does on reviews it has not yet seen

In [15]:
labels = ["Negative", "Positive"]
#i=0
i=7869 # This one the model gets wrong for positive but right for negative
i = 180 # Positive correct, negative wrong
#i = np.random.randint(len(test_neg_files))
print("File number: {}\n".format(i))
print("Positive review:\n {}".format(test_pos[i]))
#print("\nPrediction: {}\n".format(labels[y_pred[i]]))
print("\nPrediction: {0}, score: {1}\n".format(labels[y_pred[i]], y_proba[i][1]))

print("Negative review:\n {}".format(test_neg[i]))
print("\nPrediction: {0}, score: {1}".format(labels[y_pred[i+ len(test_pos_files)]], y_proba[i + len(test_pos_files)][1]))

File number: 180

Positive review:
 Dr. Ben McKenna (James Stewart) and Jo McKenna (Doris Day) travel to Morocco for a holiday where they meet a mysterious man named Louis Bernard (Daniel Gélin) on a bus.The next day this man is murdered, but before he dies he tells Ben a secret; an assassination will take place in London.The crooks kidnap the couple's son Hank (Christopher Olsen) making sure Ben won't reveal their plan to anybody.Alfred Hitchcock's The Man Who Knew Too Much (1956) is a very intense thriller.The acting is superb as it always is in Hitchcok's films.James Stewart is marvelous.Doris Day is a delightful person and actress and she gets to show her singing talents as well.The song Que Sera, Sera has an important part in the movie.This movie is a movie of many classic scenes.In the final scenes at the Albert Hall, done without dialogue, you can barely blink your eyes.This movie is fifty years old now.Time hasn't decreased its power in any way.

Prediction: Positive, score: 0.

## Evaluate via model metrics

In [16]:
print("AUC score: {}".format(roc_auc_score(y_test, y_proba[:, 1])))
print("cross entropy: {}".format(log_loss(y_test, y_proba[:, 1])))
print("Error rate: {}".format(1-accuracy_score(y_test, y_pred)))
bow_error_rate = 1-accuracy_score(y_test, y_pred)

AUC score: 0.9352565504
cross entropy: 0.4155153606134335
Error rate: 0.13327999999999995


In [17]:
save_me = False

model_coefs = pd.DataFrame({'coefficient': clf.coef_.tolist()[0], 'words': vectorizer.get_feature_names()})
pos_words = model_coefs.sort_values('coefficient', ascending=False).head(10)
neg_words = model_coefs.sort_values('coefficient', ascending=False).tail(10)
if save_me:
    pos_words.to_csv("../slides/slide_data/positive_words.csv", index=False)
    neg_words.to_csv("../slides/slide_data/negative_words.csv", index=False)
pos_words    

Unnamed: 0,coefficient,words
52771,1.596494,refreshing
71737,1.417661,wonderfully
25062,1.334485,funniest
21086,1.330959,erotic
21626,1.296048,excellent
62709,1.278945,superb
9624,1.257285,carrey
47652,1.253918,perfect
62956,1.236689,surprisingly
23670,1.232715,flawless


In [18]:
neg_words

Unnamed: 0,coefficient,words
36483,-1.487891,laughable
67907,-1.491771,unfunny
7358,-1.512002,boring
40886,-1.53786,mess
4139,-1.697967,awful
36000,-1.759309,lacks
49246,-1.806683,poorly
17396,-2.080688,disappointment
70473,-2.109686,waste
71937,-2.183291,worst


### Look at scores for individual words

In [19]:
# word = 'successful'
# word = 'painful'
# word = 'family'
# word = 'inspired'
# word = 'feelings'
# word = 'human'
# word = 'real'
# word = 'discussions'
# word = 'boring'
# word = 'strong'
# word = 'corpse'
# word = 'piano'
# word = 'gun'
# word = 'fast'
# word = 'horseback'
# word = 'great'
# word = 'bad'
# word = 'loveliest'
# word = 'marvin'
# word = 'matt'
# word = 'george'
# word = 'predictable'
# word = 'genius'
# word = 'creativity'
# word = 'excellent'
word = 'marvel'

model_coefs.coefficient[model_coefs.words == word]

39687    0.932585
Name: coefficient, dtype: float64

## Problem
* Rank the IMDB sentiments of the following nationalities: German, Slovenian, American, Italian.
* What is the difference in sentiment score in use the singular or plural of the nationalities?

# word2vec from GenSim
Skip-gram, with text processing inspired by https://www.kaggle.com/c/word2vec-nlp-tutorial#part-2-word-vectors

In [20]:
# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

review = test_pos[1]
# Turn review into list of sentences
sentence_list = tokenizer.tokenize(review.strip())
review_lol = [re.sub("[^a-zA-Z]", " ", soup(s, "html5lib").get_text()).split() for s in sentence_list]
review_lol[:20]

[['Actor',
  'turned',
  'director',
  'Bill',
  'Paxton',
  'follows',
  'up',
  'his',
  'promising',
  'debut',
  'the',
  'Gothic',
  'horror',
  'Frailty',
  'with',
  'this',
  'family',
  'friendly',
  'sports',
  'drama',
  'about',
  'the',
  'U',
  'S',
  'Open',
  'where',
  'a',
  'young',
  'American',
  'caddy',
  'rises',
  'from',
  'his',
  'humble',
  'background',
  'to',
  'play',
  'against',
  'his',
  'Bristish',
  'idol',
  'in',
  'what',
  'was',
  'dubbed',
  'as',
  'The',
  'Greatest',
  'Game',
  'Ever',
  'Played'],
 ['I',
  'm',
  'no',
  'fan',
  'of',
  'golf',
  'and',
  'these',
  'scrappy',
  'underdog',
  'sports',
  'flicks',
  'are',
  'a',
  'dime',
  'a',
  'dozen',
  'most',
  'recently',
  'done',
  'to',
  'grand',
  'effect',
  'with',
  'Miracle',
  'and',
  'Cinderella',
  'Man',
  'but',
  'some',
  'how',
  'this',
  'film',
  'was',
  'enthralling',
  'all',
  'the',
  'same',
  'The',
  'film',
  'starts',
  'with',
  'some',
  'creat

In [21]:
class MySentences(object):
    """
    Sentence parser and iterator from file, modified from
    https://rare-technologies.com/word2vec-tutorial/
    
    My modifications:
        * Text cleansing
        * Can accept list of directories
    """
    def __init__(self, dirname, re_pattern = "[^a-zA-Z]"):
        self.dirname = dirname
        self.re_pattern = re_pattern
        
    def parse(self, raw_text):
        text = soup(raw_text, "html5lib").get_text().lower()
        return re.sub(self.re_pattern," ",text).split()
 
    def __iter__(self):
        if not isinstance(self.dirname, list):
            self.dirname = [self.dirname]
        for text_dir in self.dirname:
            for fname in os.listdir(text_dir):
                for line in open(os.path.join(text_dir, fname)):
                    yield self.parse(line)

In [22]:
sentences = MySentences([train_pos_dir, train_neg_dir, train_unsup_dir])
cores = multiprocessing.cpu_count()
assert gensim.models.word2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

%time model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=cores)
wvs = model.wv
print(len(wvs.vocab))
wvs.word_vec

CPU times: user 19min 8s, sys: 25.1 s, total: 19min 33s
Wall time: 17min 46s
47167


<bound method KeyedVectors.word_vec of <gensim.models.keyedvectors.KeyedVectors object at 0x118469048>>

## IMBD word similarity

In [23]:
occupations = {}
occupations['mathematician/cool'] = model.wv.similarity('mathematician', 'cool')
occupations['mathematician/insane'] = model.wv.similarity('mathematician', 'insane')
occupations['politician/cool'] = model.wv.similarity('politician', 'cool')
occupations['politician/crooked'] = model.wv.similarity('politician', 'crooked')

pprint(occupations)

if save_me:
    pd.Series(occupations).to_csv("../slides/slide_data/math_politics.csv",)

{'mathematician/cool': -0.11889151588692025,
 'mathematician/insane': 0.098659624566295831,
 'politician/cool': -0.10177807999433539,
 'politician/crooked': 0.63820138175875307}


In [24]:
print(model.wv.similarity('germans', 'german'), 
      model.wv.similarity('americans', 'american'), 
      model.wv.similarity('italians', 'italian'))

0.483251227961 0.539901505122 0.412879400084


## Compare BoW to doc2vec

In [25]:
if os.path.exists("../data/doc2vec_results.csv"):
    doc2vec_results = pd.read_csv("../data/doc2vec_results.csv", header=None)#, index_col=0)
    doc2vec_results.rename(columns={0: 'run', 1: "error rate"}, inplace=True)
    doc2vec_results = doc2vec_results.append([{'run': 'BoW one-hot-encoding', 'error rate': bow_error_rate}])
    if save_me:
        doc2vec_results.sort_values('error rate').to_csv("../slides/slide_data/imdb_all_results.csv", index=False)
    print(doc2vec_results.sort_values('error rate'))

## Problems

* Give a brief explanation of what cross validation is, both as a way to measure model performance and a way to improve model generalization (i.e. predictive power on data the model has never seen).
* `*` Use the cross validation version of logistic regression and test if the error rate decreases. Explain your results (general arguments enough)
* `***` Use a neural network as a sentiment classifier on the BoW count vectors and compare performance, e.g. using Keras: https://keras.io/getting-started/sequential-model-guide/