In [None]:
import nltk
import pandas as pd

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from toolbox import ToolBox
from textblob import TextBlob

# nltk.download('vader_lexicon')

In [None]:
# Quick sanity check: build a VADER analyzer and score a short negated phrase.
# The returned score dict is displayed as the cell output.
sid = SentimentIntensityAnalyzer()
sid.polarity_scores('This is not cool')

In [None]:
# Project helper object from the local `toolbox` module; used below to load data.
tb = ToolBox()

In [None]:
# Ask the ToolBox helper for table 'user_reviews' with the filter lang = 'en'
# (presumably English-language reviews only — confirm against the table schema),
# then preview the first rows.
reviews = tb.load_data_sql(table='user_reviews', where="lang = 'en'")
reviews.head()

In [None]:
def vader_sentiment_to_df(dataframe):
    '''
        Adds a VADER sentiment score (nltk.sentiment.vader) to every review.

        Input: dataframe with a 'review' text column plus the columns
               'username', 'lang', 'helpful_nb' and 'helpful_nb_total'
               (these four are dropped; a KeyError is raised if one is missing)
        Output: new dataframe (the input is left untouched) without those four
                columns, with a 'sentiment' column holding the VADER compound
                score of each review, and with an 'index' column, if present,
                renamed to 'id'
    '''

    sid = SentimentIntensityAnalyzer()

    # read the compound score by key rather than by its position in the
    # result dict, so the code does not depend on the dict's key order
    sentiment = [sid.polarity_scores(text)['compound'] for text in dataframe['review']]

    # drops unnecessary columns
    new_dataframe = dataframe.drop(columns=['username', 'lang', 'helpful_nb', 'helpful_nb_total'])

    # adds the sentiment column; assign() attaches the scores positionally, so
    # every score stays on the row of its review even when the index is not
    # 0..n-1 (e.g. a filtered or sorted frame). Concatenating a fresh frame
    # along axis=1 would align on index labels instead and create NaN rows.
    new_dataframe = new_dataframe.assign(sentiment=sentiment)

    # renames the index column to match database
    new_dataframe = new_dataframe.rename(columns={'index': 'id'})

    return new_dataframe

In [None]:
# Run the VADER scoring helper over the loaded reviews.
new_df = vader_sentiment_to_df(reviews)

In [None]:
# Preview the first rows of the scored frame.
new_df.head()

In [None]:
# Write new_df to a pickle file in the working directory so the scored frame
# can be reloaded later without recomputing the sentiment scores.
new_df.to_pickle('user_review_clean.pkl')

In [None]:
# Load the pickle file written above back into new_df.
# NOTE: unpickling can execute arbitrary code; only load pickle files you
# created yourself.
new_df = pd.read_pickle('user_review_clean.pkl')

#### Attempt to load the dataframe into the database (currently fails)

In [None]:
from models import *

In [None]:
# Overwrite the 'date' column with its string representation.
# NOTE(review): presumably done so the values can be handed to the database
# insert below — confirm against the model's field type.
new_df['date'] = new_df['date'].astype(str)

In [None]:
# One dict per row (column name -> value); this list is what gets sliced into
# batches for insert_many in the next cell.
data = new_df.to_dict(orient='records')

In [None]:
# Insert rows BATCH_SIZE at a time inside a single transaction.
# The original comment said 100 while the code sliced 1000 rows per statement;
# the batch size is now a named constant set to the documented 100.
# NOTE(review): very large multi-row inserts can exceed a backend's limit on
# bound variables per statement (SQLite in particular) — confirm whether that
# is what made this load fail before raising the value again.
BATCH_SIZE = 100

with database.atomic():
    for idx in range(0, len(data), BATCH_SIZE):
        UserReviewsClean.insert_many(data[idx:idx + BATCH_SIZE]).execute()

### Experimentation with Flair

In [None]:
import flair
from flair.data import Sentence

# Load flair's text classifier registered under the name 'en-sentiment'.
flair_sentiment = flair.models.TextClassifier.load('en-sentiment')

In [None]:
# Pick one sample review: the 13th row (position 12) among reviews with a
# grade above 90. Raises IndexError if fewer than 13 rows match.
review = reviews[reviews['grade'] > 90].iloc[12]['review']

print(review)

# Wrap the text in a flair Sentence and run the sentiment classifier on it.
# NOTE(review): the prediction is presumably attached to `s` as labels —
# confirm for the installed flair version.
s = flair.data.Sentence(review)
flair_sentiment.predict(s)

In [None]:
# Load flair's sequence tagger registered under the name 'nl-ner'
# (a Dutch NER model, judging by the name and the Dutch sample texts below).
tagger = flair.models.SequenceTagger.load('nl-ner')

In [None]:
# Example Dutch text, wrapped in a single flair Sentence, to run the tagger on.
sentence = Sentence('Eetzaal van de aan het klooster verbonden kweekschool. De meisjes die daar werden opgeleid tot onderwijzeres verbleven intern in een internaat. De enige versieringen in deze zaal bestaan uit religieuze voorstellingen, waaronder een Mariabeeld en een H. Hartbeeld.')

# predict NER tags (the tagged spans are read back from `sentence` in the next cell)
tagger.predict(sentence)

# print sentence with predicted tags
# print(sentence.to_tagged_string())

In [None]:
# Print every span stored on `sentence` under the 'ner' label type.
for entity in sentence.get_spans('ner'):
    print(entity)

In [None]:
raw = "Sint-Janssingel gezien in zuidelijke richting ter hoogte van de Stationsbrug, later Wilhelminabrug. Midden op de achtergrond de molen bij het Wilhelminapark. Rechts de nieuwe bebouwing aan de Kloostersingel, hoek Luybenstraat. De rest van de nieuwe wijk het Zand is nog onbebouwd. Achter de bebouwing is de kerk van de paters Kapucijnen te zien. Links de Sint-Janssingel met daarachter het kloostercomplex van de Marienburg. Fotograaf is Herman de Ruijter HdR"

# Naive sentence split on '.'. Surrounding whitespace is stripped and empty
# pieces (e.g. after a trailing period) are skipped so no empty Sentence is
# built. Abbreviations containing a period would still be split.
sentences = [Sentence(part.strip()) for part in raw.split('.') if part.strip()]

# The loop variable is deliberately not called `sentence`: that name holds the
# example sentence from the earlier NER cell and would otherwise be overwritten
# here, changing what that cell's entity printout shows on a re-run.
for sent in sentences:
    print(sent)
    tagger.predict(sent)

    # print the spans tagged under the 'ner' label type for this sentence
    for entity in sent.get_spans('ner'):
        print(entity)

    print('--------------------------------')