In [46]:

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Fetch the NLTK resources before the analyzer is constructed below.
nltk.download('vader_lexicon')
# NOTE(review): no visible cell calls an NLTK tokenizer, so 'punkt' may be
# unnecessary here -- confirm before removing.
nltk.download('punkt')

# Shared VADER analyzer used by the scoring cells that follow.
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Rob\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rob\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [47]:
# Same construction as at the end of the previous cell; re-running it only
# rebinds `sid` to a fresh analyzer.
sid = SentimentIntensityAnalyzer()

In [48]:
# Sample message scored below, first as-is and then after stop-word filtering.
# NOTE(review): the text mentions Enron; presumably an excerpt from an email
# corpus -- source not shown here, confirm provenance.
message_text = '''Like you, I am getting very frustrated with this process. I am genuinely trying to be as reasonable as possible. I am not trying to "hold up" the deal at the last minute. I'm afraid that I am being asked to take a fairly large leap of faith after this company (I don't mean the two of you -- I mean Enron) has screwed me and the people who work for me.'''

In [49]:
# VADER polarity scores for the unprocessed message; the result is a dict with
# 'neg', 'neu', 'pos' and 'compound' entries.
scores = sid.polarity_scores(message_text)
scores

{'neg': 0.093, 'neu': 0.836, 'pos': 0.071, 'compound': -0.3804}

In [50]:
import spacy
from spacy.tokenizer import Tokenizer
import en_core_web_md
# Load the spaCy pipeline shipped in the en_core_web_md package.
nlp = en_core_web_md.load()

# Tokenizer built from the pipeline's vocabulary only: no prefix, suffix or
# infix rules are passed, so punctuation stays attached to the neighbouring
# word (e.g. 'process.' is a single token).
tokenizer = Tokenizer(nlp.vocab)
# Stop-word set of the loaded language; not referenced by the visible cells,
# which rely on `token.is_stop` instead.
STOP_WORDS = nlp.Defaults.stop_words

In [77]:
# Plain whitespace split for inspection; punctuation stays attached to words.
tokens = message_text.split()
tokens

['Like',
 'you,',
 'I',
 'am',
 'getting',
 'very',
 'frustrated',
 'with',
 'this',
 'process.',
 'I',
 'am',
 'genuinely',
 'trying',
 'to',
 'be',
 'as',
 'reasonable',
 'as',
 'possible.',
 'I',
 'am',
 'not',
 'trying',
 'to',
 '"hold',
 'up"',
 'the',
 'deal',
 'at',
 'the',
 'last',
 'minute.',
 "I'm",
 'afraid',
 'that',
 'I',
 'am',
 'being',
 'asked',
 'to',
 'take',
 'a',
 'fairly',
 'large',
 'leap',
 'of',
 'faith',
 'after',
 'this',
 'company',
 '(I',
 "don't",
 'mean',
 'the',
 'two',
 'of',
 'you',
 '--',
 'I',
 'mean',
 'Enron)',
 'has',
 'screwed',
 'me',
 'and',
 'the',
 'people',
 'who',
 'work',
 'for',
 'me.']

In [52]:
# pandas is imported here because the notebook otherwise imports it only in a
# later cell, which makes this cell fail with NameError on a fresh kernel.
import pandas as pd

# One row per whitespace-separated word of the message.
df = pd.DataFrame(message_text.split(), columns=['words'])

# For every word, keep the lower-cased token texts that spaCy flags as neither
# a stop word nor punctuation. Words that are filtered out entirely leave an
# empty list so the result stays aligned with the DataFrame rows.
tokens = [
    [token.text.lower() for token in doc if not token.is_stop and not token.is_punct]
    for doc in tokenizer.pipe(df['words'])
]

df['tokens'] = tokens

In [69]:
# Flatten the per-word token lists into one list in row order. Rows whose
# tokens were all filtered out are empty and contribute nothing. A nested
# comprehension avoids the quadratic list copying of sum(lists, []).
word_list = [word for row_tokens in df['tokens'] for word in row_tokens]
message = ' '.join(word_list)
message

'like you, getting frustrated process. genuinely trying reasonable possible. trying "hold up" deal minute. i\'m afraid asked fairly large leap faith company (i don\'t mean mean enron) screwed people work me.'

In [70]:
# Score the stop-word-filtered message; this rebinds `scores`, replacing the
# raw-text result computed earlier.
scores = sid.polarity_scores(message)
scores

{'neg': 0.17, 'neu': 0.694, 'pos': 0.136, 'compound': -0.3182}

### Stemming and lemmatization

In [92]:
# Text used as the default input of the scoring functions defined below.
# Note that it mixes typographic (’) and ASCII (') apostrophes.
lyrics = '''
I never thought you were a saint
I just wanted you to be the one
I always figure you would stay
But now you’re gone
You’re gone

So burn away my time
I want your hand in mine
It hurt to see you go
'cause I was in love with you

I’m caught here balanced on a wire
Bound between your heart and mine
I should have given so much more
Instead I watched you walk away

In my mind
There you are
With me
In your arms
I feel so alive
When you're mine
But here we are
You’re gone
On your own
I’m here
All alone
I just could not speak
A word
So within my mind
Here you are
With me
In your arms
I feel so alive
'''

In [93]:
# --- Imports -----------------------------------------------------------------
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer

import spacy
from spacy.tokenizer import Tokenizer
import en_core_web_md

import pandas as pd

# --- NLTK: resources must be present before the analyzer is constructed ------
nltk.download('vader_lexicon')
nltk.download('punkt')
sid = SentimentIntensityAnalyzer()

# --- spaCy: pipeline, vocabulary-only tokenizer, stop-word set ---------------
nlp = en_core_web_md.load()
tokenizer = Tokenizer(nlp.vocab)
STOP_WORDS = nlp.Defaults.stop_words

# --- Porter stemmer used by stemmed_score ------------------------------------
ps = PorterStemmer()

In [134]:
def processed_score(data=lyrics):
    """Return the VADER compound score of ``data`` after stop-word filtering.

    The text is split on whitespace and each word is run through the
    module-level spaCy ``tokenizer``. Tokens flagged as stop words or
    punctuation are dropped; the remaining token texts are lower-cased,
    joined with single spaces and scored by the module-level ``sid``
    analyzer.

    Args:
        data: Text to score. Defaults to the ``lyrics`` string as it was
            bound when this function was defined.

    Returns:
        The ``'compound'`` entry of ``sid.polarity_scores(...)`` for the
        filtered text.
    """
    # A single flat comprehension replaces the DataFrame round-trip and the
    # quadratic sum(lists, []) flatten; token order is unchanged.
    kept_words = [
        token.text.lower()
        for doc in tokenizer.pipe(data.split())
        for token in doc
        if not token.is_stop and not token.is_punct
    ]
    return sid.polarity_scores(' '.join(kept_words))['compound']

def stemmed_score(data=lyrics):
    """Return the VADER compound score of ``data`` after Porter stemming.

    Each whitespace-separated word is stemmed with the module-level
    ``PorterStemmer`` instance ``ps``. The stems are then run through the
    module-level spaCy ``tokenizer``; tokens flagged as stop words or
    punctuation are dropped, and the remaining token texts are lower-cased,
    joined with single spaces and scored by the module-level ``sid``
    analyzer.

    Args:
        data: Text to score. Defaults to the ``lyrics`` string as it was
            bound when this function was defined.

    Returns:
        The ``'compound'`` entry of ``sid.polarity_scores(...)`` for the
        stemmed, filtered text.
    """
    # Stemming happens before stop-word detection, so the filter sees stems.
    stems = [ps.stem(word) for word in data.split()]

    # Flat comprehension instead of a DataFrame round-trip plus
    # sum(lists, []); token order is unchanged.
    kept_words = [
        token.text.lower()
        for doc in tokenizer.pipe(stems)
        for token in doc
        if not token.is_stop and not token.is_punct
    ]
    return sid.polarity_scores(' '.join(kept_words))['compound']

def get_lemmas(data):
    """Collect the lemmas of the content tokens in ``data``.

    Runs ``data`` through the module-level spaCy pipeline ``nlp`` and
    returns, in document order, ``token.lemma_`` for every token that is not
    a stop word, not punctuation, and not tagged ``'PRON'``.
    """
    return [
        tok.lemma_
        for tok in nlp(data)
        if not (tok.is_stop or tok.is_punct or tok.pos_ == 'PRON')
    ]

def lemma_score(data=lyrics):
    """Return the VADER compound score of ``data`` after lemmatization.

    ``get_lemmas`` reduces ``data`` to the lemmas of its non-stop-word,
    non-punctuation, non-pronoun tokens. Each lemma is then run through the
    module-level spaCy ``tokenizer``; tokens flagged as stop words or
    punctuation are dropped (a lemma can itself be a stop word), and the
    remaining token texts are lower-cased, joined with single spaces and
    scored by the module-level ``sid`` analyzer.

    Args:
        data: Text to score. Defaults to the ``lyrics`` string as it was
            bound when this function was defined.

    Returns:
        The ``'compound'`` entry of ``sid.polarity_scores(...)`` for the
        lemmatized, filtered text.
    """
    # Lemmatize the text that was passed in. The previous version always
    # lemmatized the global `lyrics`, silently ignoring `data`.
    lemmas = get_lemmas(data)

    # Flat comprehension instead of a DataFrame round-trip plus
    # sum(lists, []); token order is unchanged.
    kept_words = [
        token.text.lower()
        for doc in tokenizer.pipe(lemmas)
        for token in doc
        if not token.is_stop and not token.is_punct
    ]
    return sid.polarity_scores(' '.join(kept_words))['compound']



In [137]:
# Compound scores for the default lyrics, in order: stop-word filtered only,
# Porter-stemmed, lemmatized.
processed_score(), stemmed_score(), lemma_score()

(0.8591, 0.6808, 0.8689)