In [2]:
import nltk
import pandas as pd

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from toolbox import ToolBox
from textblob import TextBlob

# nltk.download('vader_lexicon')

In [3]:
sid = SentimentIntensityAnalyzer()
sid.polarity_scores('This is not super fun')

{'neg': 0.661, 'neu': 0.339, 'pos': 0.0, 'compound': -0.7048}

In [4]:
tb = ToolBox()

In [5]:
reviews = tb.load_data_sql(table='user_reviews', where="lang = 'en'", limit=1000)
reviews.head()

Loaded from cache


Unnamed: 0,game,username,date,grade,review,lang,helpful_nb,helpful_nb_total
0,007-legends-pc,Bughyman1000,2013-01-19,30,"Oh, my dear God! What is it with developers th...",en,0,0
1,007-legends-pc,DiabloZiri,2014-06-23,30,Seriously if you want to have a HUGE laugh wit...,en,0,0
2,007-legends-pc,DrugsMeazureTim,2014-12-11,0,godawful port of the xbox 360 version and ps3....,en,0,0
3,007-legends-pc,DustEater,2012-11-03,0,Agree. Worst game ever. Its a full copy of Cal...,en,8,8
4,007-legends-pc,evry1isacritic,2012-11-04,0,DO NOT BUY THIS GAME for the PC (or for any ot...,en,2,2


In [6]:
def vader_sentiment_to_df(reviews):
    '''
        Gets the sentiment scores of reviews using nltk.sentiment.vader.
    '''
    sid = SentimentIntensityAnalyzer()

    negative = []
    neutral = []
    positive = []
    compound = []

    for review in reviews:
        scores = list(sid.polarity_scores(review).values())

        neg, neu, pos, comp = (scores[0], scores[1], scores[2], scores[3])

        negative.append(neg)
        neutral.append(neu)
        positive.append(pos)
        compound.append(comp)

    return pd.DataFrame.from_dict({'negative':negative, 'neutral':neutral, 'positive':positive, 'compound':compound})


In [7]:
def TB_sentiment_to_df(reviews):
    '''
    Computes the polarity and subjectivity scores of reviews using TextBlob and returns a dataframe.
    '''
    polarity = []
    subjectivity = []

    for review in reviews:
        scores = TextBlob(review).sentiment

        polarity.append(scores.polarity)
        subjectivity.append(scores.subjectivity)

    return pd.DataFrame.from_dict({'TB-polarity':polarity, 'TB-subjectivity': subjectivity})

In [8]:
vader_sentiment_df = vader_sentiment_to_df(reviews['review'])

TB_sentiment_df = TB_sentiment_to_df(reviews['review'])

vader_sentiment_df.shape, TB_sentiment_df.shape

((1000, 4), (1000, 2))

In [9]:
sentiment_df = pd.concat([vader_sentiment_df, TB_sentiment_df], axis = 1)
sentiment_df.head()

Unnamed: 0,negative,neutral,positive,compound,TB-polarity,TB-subjectivity
0,0.156,0.767,0.077,-0.9191,-0.132968,0.50303
1,0.053,0.402,0.545,0.9424,0.048333,0.433333
2,0.078,0.874,0.048,-0.3869,-0.5,0.6
3,0.143,0.654,0.204,0.5496,-0.126389,0.426389
4,0.13,0.823,0.047,-0.9612,-0.289268,0.430177


In [12]:
for review in reviews['review'][0:5]:
    print(review)
    print('')

Oh, my dear God! What is it with developers these days? i understand the fact that almost all licensed games are awful, but.. really, that awful?! It's really a torment, playing through this game. i only finished it because I didn't want to stop at the second mission. I might say I have S&amp;M tendencies, for putting myself through this. First: The graphics strike you as being 2004-ish (at best) and I don't wanna hear anyone say "it's a console title". That doesn't matter, there are so many other well-made games. It has no adjustable settings, besides the resolution!  The sound - horrible, The controls - outrageous (without gamepad compatibility - how is this a console port if it doesn't even have gamepad compatibility?!) The acting - pure mockery..Stupid scripting often leave the characters repeating the same line over and over again, like babbling idiots. The Gadgets and gameplay opportunities - completely wasted. I can't find one good thing to say about this; even the main menu loo

In [112]:
sentiment_df.corr()

Unnamed: 0,negative,neutral,positive,compound,TB-polarity,TB-subjectivity
negative,1.0,-0.28645,-0.454577,-0.744712,-0.622535,0.155389
neutral,-0.28645,1.0,-0.723159,-0.092881,-0.154158,-0.348041
positive,-0.454577,-0.723159,1.0,0.623224,0.592198,0.211519
compound,-0.744712,-0.092881,0.623224,1.0,0.585304,-0.02243
TB-polarity,-0.622535,-0.154158,0.592198,0.585304,1.0,0.058578
TB-subjectivity,0.155389,-0.348041,0.211519,-0.02243,0.058578,1.0


In [49]:
import flair
from flair.data import Sentence

flair_sentiment = flair.models.TextClassifier.load('en-sentiment')

2019-10-02 14:28:03,804 loading file /Users/Jules/.flair/models/imdb-v0.4.pt


In [88]:
review = reviews[reviews['grade'] > 90].iloc[12]['review']

print(review)

s = flair.data.Sentence(review)
flair_sentiment.predict(s)

The only reason why we didnâ<U+0080><U+0099>t give our best vote to this game is because it is closely related to an event after which the title could lose part of its attractiveness (there are no league teams and there is no substantial change in the multiplayer mode). Other than this, the best just got better! FIFA World Cup 2010 is just a few inches from perfection.


[Sentence: "The only reason why we didnâ<U+0080><U+0099>t give our best vote to this game is because it is closely related to an event after which the title could lose part of its attractiveness (there are no league teams and there is no substantial change in the multiplayer mode). Other than this, the best just got better! FIFA World Cup 2010 is just a few inches from perfection." - 66 Tokens]

In [61]:
tagger = flair.models.SequenceTagger.load('nl-ner')

2019-10-02 14:43:39,071 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4/NER-conll2002-dutch/nl-ner-conll02-v0.1.pt not found in cache, downloading to /var/folders/h6/c23ktx711x517m98bmjdb8jh0000gn/T/tmp6n0clre3


100%|██████████| 1392544535/1392544535 [01:41<00:00, 13693478.51B/s]

2019-10-02 14:45:20,972 copying /var/folders/h6/c23ktx711x517m98bmjdb8jh0000gn/T/tmp6n0clre3 to cache at /Users/Jules/.flair/models/nl-ner-conll02-v0.1.pt





2019-10-02 14:45:22,915 removing temp file /var/folders/h6/c23ktx711x517m98bmjdb8jh0000gn/T/tmp6n0clre3
2019-10-02 14:45:23,022 loading file /Users/Jules/.flair/models/nl-ner-conll02-v0.1.pt


In [76]:
sentence = Sentence('Eetzaal van de aan het klooster verbonden kweekschool. De meisjes die daar werden opgeleid tot onderwijzeres verbleven intern in een internaat. De enige versieringen in deze zaal bestaan uit religieuze voorstellingen, waaronder een Mariabeeld en een H. Hartbeeld.')

# predict NER tags
tagger.predict(sentence)

# print sentence with predicted tags
# print(sentence.to_tagged_string())

[Sentence: "Eetzaal van de aan het klooster verbonden kweekschool. De meisjes die daar werden opgeleid tot onderwijzeres verbleven intern in een internaat. De enige versieringen in deze zaal bestaan uit religieuze voorstellingen, waaronder een Mariabeeld en een H. Hartbeeld." - 38 Tokens]

In [77]:
for entity in sentence.get_spans('ner'):
    print(entity)

MISC-span [34]: "Mariabeeld"
MISC-span [37,38]: "H. Hartbeeld."


In [87]:
raw = "Sint-Janssingel gezien in zuidelijke richting ter hoogte van de Stationsbrug, later Wilhelminabrug. Midden op de achtergrond de molen bij het Wilhelminapark. Rechts de nieuwe bebouwing aan de Kloostersingel, hoek Luybenstraat. De rest van de nieuwe wijk het Zand is nog onbebouwd. Achter de bebouwing is de kerk van de paters Kapucijnen te zien. Links de Sint-Janssingel met daarachter het kloostercomplex van de Marienburg. Fotograaf is Herman de Ruijter HdR"

sentences = [Sentence(sentence) for sentence in raw.split('.')]

for sentence in sentences:
    print(sentence)
    tagger.predict(sentence)
    
    for entity in sentence.get_spans('ner'):
        print(entity)
        
    print('--------------------------------')

Sentence: "Sint-Janssingel gezien in zuidelijke richting ter hoogte van de Stationsbrug, later Wilhelminabrug" - 12 Tokens
LOC-span [1]: "Sint-Janssingel"
LOC-span [10]: "Stationsbrug,"
LOC-span [12]: "Wilhelminabrug"
--------------------------------
Sentence: "Midden op de achtergrond de molen bij het Wilhelminapark" - 9 Tokens
LOC-span [9]: "Wilhelminapark"
--------------------------------
Sentence: "Rechts de nieuwe bebouwing aan de Kloostersingel, hoek Luybenstraat" - 9 Tokens
LOC-span [7]: "Kloostersingel,"
LOC-span [9]: "Luybenstraat"
--------------------------------
Sentence: "De rest van de nieuwe wijk het Zand is nog onbebouwd" - 11 Tokens
LOC-span [8]: "Zand"
--------------------------------
Sentence: "Achter de bebouwing is de kerk van de paters Kapucijnen te zien" - 12 Tokens
PER-span [10]: "Kapucijnen"
--------------------------------
Sentence: "Links de Sint-Janssingel met daarachter het kloostercomplex van de Marienburg" - 10 Tokens
LOC-span [3]: "Sint-Janssingel"
LOC-sp