In [1]:
import nltk
import pandas as pd

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from toolbox import ToolBox
from textblob import TextBlob

# nltk.download('vader_lexicon')

In [14]:
# Quick sanity check: score one short, negated phrase with VADER.
sid = SentimentIntensityAnalyzer()
# polarity_scores returns a dict with 'neg', 'neu', 'pos' and 'compound' entries.
sid.polarity_scores('This is not cool')

{'neg': 0.395, 'neu': 0.605, 'pos': 0.0, 'compound': -0.2411}

In [2]:
# ToolBox is imported from the project-local `toolbox` module; it is used below to load the review data.
tb = ToolBox()

In [15]:
# Load the rows of the `user_reviews` table whose `lang` column equals 'en'.
reviews = tb.load_data_sql(table='user_reviews', where="lang = 'en'")
# Preview the first rows.
reviews.head()

Loaded from cache


Unnamed: 0,game,username,date,grade,review,lang,helpful_nb,helpful_nb_total
0,007-legends-pc,Bughyman1000,2013-01-19,30,"Oh, my dear God! What is it with developers th...",en,0,0
1,007-legends-pc,DiabloZiri,2014-06-23,30,Seriously if you want to have a HUGE laugh wit...,en,0,0
2,007-legends-pc,DrugsMeazureTim,2014-12-11,0,godawful port of the xbox 360 version and ps3....,en,0,0
3,007-legends-pc,DustEater,2012-11-03,0,Agree. Worst game ever. Its a full copy of Cal...,en,8,8
4,007-legends-pc,evry1isacritic,2012-11-04,0,DO NOT BUY THIS GAME for the PC (or for any ot...,en,2,2


In [30]:
def vader_sentiment_to_df(dataframe, analyzer=None):
    '''
        Gets the sentiment scores of reviews using nltk.sentiment.vader.

        Input:
            dataframe: dataframe with reviews. Must contain a 'review' column
                plus the 'username', 'lang', 'helpful_nb' and
                'helpful_nb_total' columns, which are dropped from the result.
            analyzer: optional object exposing polarity_scores(text) -> dict
                with a 'compound' entry. Defaults to a new
                SentimentIntensityAnalyzer; pass one in to reuse an instance
                across calls.
        Output: new dataframe (the input is left untouched) without the
            dropped columns and with a 'sentiment' column holding the VADER
            compound score of each review. The input's index is preserved.
        Raises: KeyError if one of the required columns is missing.
    '''

    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()

    # look the compound score up by key instead of by position in the dict,
    # so the result does not depend on the order of the returned entries
    compound = [
        analyzer.polarity_scores(text)['compound']
        for text in dataframe['review']
    ]

    # carry the input's own index: attaching the scores with a fresh 0..n-1
    # index would align on labels and produce NaN-padded, misaligned rows for
    # any dataframe that was filtered or otherwise re-indexed
    sentiment = pd.Series(compound, index=dataframe.index, dtype='float64')

    new_dataframe = (
        dataframe
        # drops unnecessary columns
        .drop(columns=['username', 'lang', 'helpful_nb', 'helpful_nb_total'])
        # adds sentiment column, named to match the database
        .assign(sentiment=sentiment)
        # an 'index' column, if the caller provides one, is stored as 'id'
        .rename(columns={'index': 'id'})
    )

    return new_dataframe

In [None]:
# Score every loaded review with VADER; the result carries a `sentiment` column.
new_df = vader_sentiment_to_df(reviews)

In [32]:
# Preview the scored dataframe: `sentiment` is added, the user and helpfulness columns are gone.
new_df.head()

Unnamed: 0,game,date,grade,review,sentiment
0,007-legends-pc,2013-01-19,30,"Oh, my dear God! What is it with developers th...",-0.9191
1,007-legends-pc,2014-06-23,30,Seriously if you want to have a HUGE laugh wit...,0.9424
2,007-legends-pc,2014-12-11,0,godawful port of the xbox 360 version and ps3....,-0.3869
3,007-legends-pc,2012-11-03,0,Agree. Worst game ever. Its a full copy of Cal...,0.5496
4,007-legends-pc,2012-11-04,0,DO NOT BUY THIS GAME for the PC (or for any ot...,-0.9612


In [97]:
# write new_df to pickle file
# (relative path with no extension, resolved against the current working directory)
new_df.to_pickle('user_review_clean')

In [2]:
# load pickle file
# (restores new_df from the file written by the to_pickle cell, so the scoring step need not be re-run;
#  only unpickle files you created yourself)
new_df = pd.read_pickle('user_review_clean')

#### Trying to load the dataframe into the database (failed)

In [11]:
from models import *

In [39]:
# Convert the dataframe into a list with one {column: value} dict per row, for the inserts below.
data = new_df.to_dict(orient='records')

In [84]:
# NOTE(review): `data` is a list of row dicts. Assuming UserReviewsClean is a peewee model,
# `insert()` describes a single row, so this is presumably the attempt that failed;
# `insert_many()` is the call that takes a list of rows. Confirm against models.py.
UserReviewsClean.insert(data).execute()

In [89]:
# Insert all rows with a single insert_many call, wrapped in database.atomic()
# (a transaction, per the original note).
# NOTE(review): every insert cell in this section writes the same `data`; running more than
# one of them presumably stores the rows more than once — confirm which attempt should remain.
with database.atomic():
    UserReviewsClean.insert_many(data).execute()

In [44]:
# Insert rows 100 at a time.
with database.atomic():
    # step through `data` in slices of 100 rows; the final slice may be shorter
    for idx in range(0, len(data), 100):
        UserReviewsClean.insert_many(data[idx:idx+100]).execute()

### Experimentation with Flair

In [49]:
import flair
from flair.data import Sentence

# Load flair's 'en-sentiment' text classifier (the log below shows it being read from the local flair model cache).
flair_sentiment = flair.models.TextClassifier.load('en-sentiment')

2019-10-02 14:28:03,804 loading file /Users/Jules/.flair/models/imdb-v0.4.pt


In [88]:
# Take the review text at position 12 among the reviews graded above 90.
review = reviews.loc[reviews['grade'] > 90, 'review'].iloc[12]
print(review)

# Wrap the text in a flair Sentence and classify it; the value returned by
# predict() is what the cell displays.
s = Sentence(review)
flair_sentiment.predict(s)

The only reason why we didnâ<U+0080><U+0099>t give our best vote to this game is because it is closely related to an event after which the title could lose part of its attractiveness (there are no league teams and there is no substantial change in the multiplayer mode). Other than this, the best just got better! FIFA World Cup 2010 is just a few inches from perfection.


[Sentence: "The only reason why we didnâ<U+0080><U+0099>t give our best vote to this game is because it is closely related to an event after which the title could lose part of its attractiveness (there are no league teams and there is no substantial change in the multiplayer mode). Other than this, the best just got better! FIFA World Cup 2010 is just a few inches from perfection." - 66 Tokens]

In [61]:
# Load flair's 'nl-ner' sequence tagger; per the log below, the model file is downloaded
# into the flair cache when it is not there yet.
tagger = flair.models.SequenceTagger.load('nl-ner')

2019-10-02 14:43:39,071 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4/NER-conll2002-dutch/nl-ner-conll02-v0.1.pt not found in cache, downloading to /var/folders/h6/c23ktx711x517m98bmjdb8jh0000gn/T/tmp6n0clre3


100%|██████████| 1392544535/1392544535 [01:41<00:00, 13693478.51B/s]

2019-10-02 14:45:20,972 copying /var/folders/h6/c23ktx711x517m98bmjdb8jh0000gn/T/tmp6n0clre3 to cache at /Users/Jules/.flair/models/nl-ner-conll02-v0.1.pt





2019-10-02 14:45:22,915 removing temp file /var/folders/h6/c23ktx711x517m98bmjdb8jh0000gn/T/tmp6n0clre3
2019-10-02 14:45:23,022 loading file /Users/Jules/.flair/models/nl-ner-conll02-v0.1.pt


In [76]:
# Sample description (three sentences, passed as a single Sentence object) to run the NER tagger on.
sentence = Sentence('Eetzaal van de aan het klooster verbonden kweekschool. De meisjes die daar werden opgeleid tot onderwijzeres verbleven intern in een internaat. De enige versieringen in deze zaal bestaan uit religieuze voorstellingen, waaronder een Mariabeeld en een H. Hartbeeld.')

# predict NER tags
# (the predicted spans are read back from `sentence` with get_spans('ner') in the next cell)
tagger.predict(sentence)

# print sentence with predicted tags
# print(sentence.to_tagged_string())

[Sentence: "Eetzaal van de aan het klooster verbonden kweekschool. De meisjes die daar werden opgeleid tot onderwijzeres verbleven intern in een internaat. De enige versieringen in deze zaal bestaan uit religieuze voorstellingen, waaronder een Mariabeeld en een H. Hartbeeld." - 38 Tokens]

In [77]:
# Print every span stored on `sentence` under the 'ner' tag type, one per line.
for entity in sentence.get_spans('ner'):
    print(entity)

MISC-span [34]: "Mariabeeld"
MISC-span [37,38]: "H. Hartbeeld."


In [87]:
raw = "Sint-Janssingel gezien in zuidelijke richting ter hoogte van de Stationsbrug, later Wilhelminabrug. Midden op de achtergrond de molen bij het Wilhelminapark. Rechts de nieuwe bebouwing aan de Kloostersingel, hoek Luybenstraat. De rest van de nieuwe wijk het Zand is nog onbebouwd. Achter de bebouwing is de kerk van de paters Kapucijnen te zien. Links de Sint-Janssingel met daarachter het kloostercomplex van de Marienburg. Fotograaf is Herman de Ruijter HdR"

# Naive sentence segmentation: split on full stops. Fragments that are empty or
# whitespace-only (what a text ending in '.' leaves behind) are skipped, so no
# empty Sentence is created, printed or tagged.
# NOTE(review): splitting on '.' also cuts abbreviations such as "H." in two;
# a real sentence splitter would be needed for such texts.
sentences = [Sentence(fragment) for fragment in raw.split('.') if fragment.strip()]

# Tag each sentence on its own and list the entities found in it.
# The loop rebinds the notebook-level name `sentence`.
for sentence in sentences:
    print(sentence)
    tagger.predict(sentence)

    for entity in sentence.get_spans('ner'):
        print(entity)

    print('--------------------------------')

Sentence: "Sint-Janssingel gezien in zuidelijke richting ter hoogte van de Stationsbrug, later Wilhelminabrug" - 12 Tokens
LOC-span [1]: "Sint-Janssingel"
LOC-span [10]: "Stationsbrug,"
LOC-span [12]: "Wilhelminabrug"
--------------------------------
Sentence: "Midden op de achtergrond de molen bij het Wilhelminapark" - 9 Tokens
LOC-span [9]: "Wilhelminapark"
--------------------------------
Sentence: "Rechts de nieuwe bebouwing aan de Kloostersingel, hoek Luybenstraat" - 9 Tokens
LOC-span [7]: "Kloostersingel,"
LOC-span [9]: "Luybenstraat"
--------------------------------
Sentence: "De rest van de nieuwe wijk het Zand is nog onbebouwd" - 11 Tokens
LOC-span [8]: "Zand"
--------------------------------
Sentence: "Achter de bebouwing is de kerk van de paters Kapucijnen te zien" - 12 Tokens
PER-span [10]: "Kapucijnen"
--------------------------------
Sentence: "Links de Sint-Janssingel met daarachter het kloostercomplex van de Marienburg" - 10 Tokens
LOC-span [3]: "Sint-Janssingel"
LOC-sp