# Word2Vec(Spacy)

In [1]:
import spacy

In [2]:
# Load the 'en_core_web_lg' spaCy model; the cells below read word vectors from it.
nlp = spacy.load('en_core_web_lg')

In [5]:
#nlp(u'lion').vector
# Show only the shape of the sentence-level vector. Displaying `.vector` itself
# would dump every component rather than the `(300,)` recorded for this cell.
nlp(u'This is a new sentence').vector.shape

(300,)

In [10]:
# Alternative input to try: 'lion cat pet'
# Parse three words so their tokens can be compared with one another.
tokens = nlp('like love hate')

In [11]:
# Compare every token with every token (itself included) and print the
# similarity score for each ordered pair.
ordered_pairs = ((left, right) for left in tokens for right in tokens)
for left, right in ordered_pairs:
    print(left.text, right.text, left.similarity(right))

like like 1.0
like love 0.657904
like hate 0.6574652
love like 0.657904
love love 1.0
love hate 0.6393099
hate like 0.6574652
hate love 0.6393099
hate hate 1.0


In [14]:
# Size of the model's vector table: (number of vectors, components per vector).
nlp.vocab.vectors.shape

(684831, 300)

In [16]:
# For each token print: its text, whether it has a vector, the norm of that
# vector, and whether the token is out-of-vocabulary.
tokens = nlp('dog cat Mukunthan')
for tok in tokens:
    row = (tok.text, tok.has_vector, tok.vector_norm, tok.is_oov)
    print(*row)

dog True 7.0336733 False
cat True 6.6808186 False
Mukunthan False 0.0 True


In [19]:
from scipy import spatial


def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between ``vec1`` and ``vec2``.

    SciPy exposes cosine *distance*; the similarity is one minus that distance.
    """
    distance = spatial.distance.cosine(vec1, vec2)
    return 1 - distance

In [20]:
# Look up the vectors of the three analogy words directly in the vocabulary.
king, man, woman = (nlp.vocab[word].vector for word in ('king', 'man', 'woman'))

In [21]:
# The result of this arithmetic is a fresh vector that is not necessarily one of
# the 684831 vectors stored in spaCy, so the following cells search for the
# stored vectors that lie closest to it.
new_vector = king - man + woman

In [22]:
# Score every vocabulary entry that has a vector, is lowercase and is alphabetic
# against `new_vector`, keeping (entry, similarity) pairs.
computed_similarities = [
    (lexeme, cosine_similarity(new_vector, lexeme.vector))
    for lexeme in nlp.vocab
    if lexeme.has_vector and lexeme.is_lower and lexeme.is_alpha
]

In [23]:
# Order the (entry, similarity) pairs from most to least similar; the score is
# the second element of each pair.
computed_similarities = sorted(
    computed_similarities,
    key=lambda pair: pair[1],
    reverse=True,
)

In [24]:
# Print the text of the ten highest-ranked entries.
top_ten = computed_similarities[:10]
print([lexeme.text for lexeme, _score in top_ten])

['king', 'queen', 'prince', 'kings', 'princess', 'royal', 'throne', 'queens', 'monarch', 'kingdom']


# VADER(NLTK) - Sentiment Analysis

In [25]:
import nltk

In [26]:
# Fetch the 'vader_lexicon' NLTK package.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to C:\Users\RISHI
[nltk_data]     MUKUNTHAN\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [27]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [28]:
# Analyzer instance reused by the scoring cells below.
sid = SentimentIntensityAnalyzer()

In [33]:
# Alternative inputs to try (commented out):
#text = "This is a good movie"
#text = "This is a good movie and BEST awesome movie!!!"
text = "This is a bad movie and WORST  movie EVER MADE"

In [34]:
# Score the sample sentence; the result is a dict with 'neg', 'neu', 'pos' and 'compound' entries.
sid.polarity_scores(text)

{'neg': 0.543, 'neu': 0.457, 'pos': 0.0, 'compound': -0.8531}

## Sentiment Analysis on Amazon Dataset

In [35]:
import pandas as pd

In [36]:
# The reviews file is tab-separated, hence sep='\t'.
df = pd.read_csv('amazonreviews.tsv', sep='\t')

In [37]:
# Preview the first rows of the loaded reviews.
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [38]:
# Number of reviews per label.
df['label'].value_counts()

neg    5097
pos    4903
Name: label, dtype: int64

In [42]:
# Drop rows with missing values, then collect the index labels of reviews that
# consist of whitespace only.
df.dropna(inplace=True)

blanks = []
# Iterate over the 'review' column alone so this cell does not depend on how
# many columns the frame has (unpacking whole rows into three names breaks once
# extra columns have been added and the cell is re-run).
for index, rv in df['review'].items():
    if isinstance(rv, str) and rv.isspace():
        # Record the row's index label; appending the undefined name `i` here
        # raised NameError as soon as a whitespace-only review was found.
        blanks.append(index)

In [44]:
# Optional follow-up, left disabled: inspect `blanks`, then drop those rows.
#blanks
#df.drop(blanks, inplace=True)

In [45]:
# Full text of the first review.
df.iloc[0]['review']

'Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'

In [46]:
# Score that single review before scoring the whole column.
sid.polarity_scores(df.iloc[0]['review'])

{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'compound': 0.9454}

In [47]:
# Score every review; each entry of the new 'scores' column holds the dict
# returned by the analyzer for that review.
df['scores'] = df['review'].apply(sid.polarity_scores)

In [48]:
# Preview the frame with the new 'scores' column.
df.head()

Unnamed: 0,label,review,scores
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co..."
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co..."
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com..."
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com..."
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp..."


In [49]:
# Pull the 'compound' entry out of each score dict into its own column.
df['compound'] = df['scores'].map(lambda score_dict: score_dict['compound'])

In [50]:
# Preview the frame with the new 'compound' column.
df.head()

Unnamed: 0,label,review,scores,compound
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781


In [51]:
# Label a review 'pos' when its compound score is zero or above, otherwise 'neg'.
is_non_negative = df['compound'].ge(0)
df['comp_score'] = is_non_negative.map({True: 'pos', False: 'neg'})

In [52]:
# Preview the frame with the new 'comp_score' column.
df.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454,pos
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957,pos
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781,pos


In [53]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [57]:
# Fraction of reviews whose derived 'comp_score' matches the dataset's 'label'.
overall_accuracy = accuracy_score(df['label'], df['comp_score'])
print(overall_accuracy)

0.7091


In [58]:
# Per-label precision, recall, f1-score and support.
print(classification_report(df['label'], df['comp_score']))

              precision    recall  f1-score   support

         neg       0.86      0.51      0.64      5097
         pos       0.64      0.91      0.75      4903

   micro avg       0.71      0.71      0.71     10000
   macro avg       0.75      0.71      0.70     10000
weighted avg       0.75      0.71      0.70     10000



In [59]:
# Rows follow the dataset label and columns the derived label, in the order
# neg, pos (the row sums of the printed matrix equal the per-label counts).
print(confusion_matrix(df['label'], df['comp_score']))

[[2623 2474]
 [ 435 4468]]


It is not the best, but it does reasonably well, with an accuracy of about 71 percent. Sarcasm cannot be identified, which could have been one of the challenges. There are also state-of-the-art deep learning methods for sentiment analysis.