In [1]:
import spacy

In [2]:
# Load the 'en_core_web_lg' spaCy model; later cells read word vectors from it
# (token/doc `.vector` and `nlp.vocab.vectors`).
nlp = spacy.load('en_core_web_lg')

In [3]:
# Shape of the vector spaCy exposes for a whole parsed text.
nlp(u'The quick brown fox jumped').vector.shape

(300,)

In [6]:
# Three words whose vectors are compared pairwise in the next cell.
tokens = nlp(u'like love hate')

In [7]:
# Print the similarity score for every ordered pair of tokens (self-pairs included).
for left in tokens:
    for right in tokens:
        print(left.text, right.text, left.similarity(right))

like like 1.0
like love 0.65790397
like hate 0.6574653
love like 0.65790397
love love 1.0
love hate 0.6393099
hate like 0.6574653
hate love 0.6393099
hate hate 1.0


In [9]:
# Shape of the model's vector table (rows, vector width).
nlp.vocab.vectors.shape

(684831, 300)

In [12]:
# Three names used in the next cell to check which tokens have a vector.
tokens = nlp(u'Ian Dan Ramalingaswami')

In [13]:
# For each token, report whether it has a vector, its vector norm,
# and its out-of-vocabulary flag.
for tok in tokens:
    print(tok.text, tok.has_vector, tok.vector_norm, tok.is_oov)

Ian True 5.6380367 False
Dan True 7.1883316 False
Ramalingaswami False 0.0 True


In [15]:
from scipy import spatial

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two 1-D vectors.

    ``spatial.distance.cosine`` yields the cosine *distance*, so the
    similarity is obtained by subtracting that distance from 1.

    Args:
        vec1: First vector (any 1-D array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The similarity as a float: 1 for vectors pointing the same way,
        0 for orthogonal vectors, -1 for opposite vectors.
    """
    return 1 - spatial.distance.cosine(vec1, vec2)

In [39]:
# Vocabulary vectors for three first names. The variable names mirror the
# classic "king - man + woman" analogy rather than the words actually looked up.
king, man, women = (
    nlp.vocab[name].vector for name in ('Ramesh', 'Mahesh', 'Suresh')
)

In [40]:
### Analogy arithmetic: king - man + woman ---> NEW_VECTOR similar to queen, princess, highness (here the three vectors are for the names Ramesh, Mahesh and Suresh)

In [41]:
# Analogy arithmetic on the three vectors loaded above.
new_vector = king - man + women

In [42]:
# Score every vocabulary entry that has a vector and is lowercase and
# alphabetic against new_vector, keeping (entry, similarity) pairs.
computed_similarities = [
    (entry, cosine_similarity(new_vector, entry.vector))
    for entry in nlp.vocab
    if entry.has_vector and entry.is_lower and entry.is_alpha
]

In [43]:
# Order the (entry, similarity) pairs from highest to lowest similarity;
# negating the score turns the default ascending sort into a descending one.
computed_similarities = list(computed_similarities)
computed_similarities.sort(key=lambda pair: -pair[1])

In [44]:
# Show the text of the first ten entries of the sorted similarity list.
print([entry.text for entry, _ in computed_similarities[:10]])

['ramesh', 'suresh', 'sandeep', 'dinesh', 'varun', 'srinivas', 'senthil', 'pradeep', 'vineet', 'arun']


In [45]:
import nltk

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


In [46]:
# Download the 'vader_lexicon' NLTK data package before using the VADER
# sentiment analyzer below.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\RishiBhatt\AppData\Roaming\nltk_data...


True

In [47]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [48]:
# Analyzer instance reused by the polarity_scores calls in the following cells.
sid = SentimentIntensityAnalyzer()

In [49]:
# First sample sentence to score.
a = "This is a good movie"

In [50]:
# Score the sentence; the result is a dict with 'neg', 'neu', 'pos' and 'compound' entries.
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}

In [51]:
# Second sample sentence, with capitalised words and repeated exclamation marks.
a = "This was the best, most awesome movie EVER MADE !!!"

In [52]:
# Score the second sample sentence.
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.458, 'pos': 0.542, 'compound': 0.8877}

In [53]:
# Third sample sentence to score.
a = "This was the worst that has ever disgraced the screen"

In [54]:
# Score the third sample sentence.
sid.polarity_scores(a)

{'neg': 0.47, 'neu': 0.53, 'pos': 0.0, 'compound': -0.7964}

In [55]:
import pandas as pd

In [56]:
# Load the tab-separated Amazon reviews file (path is relative to the notebook's working directory).
df = pd.read_csv('UPDATED_NLP_COURSE/TextFiles/amazonreviews.tsv', sep='\t')

In [57]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [58]:
# Number of reviews per label value.
df['label'].value_counts()

neg    5097
pos    4903
Name: label, dtype: int64

In [59]:
# Remove rows that contain missing values (mutates df in place).
df.dropna(inplace=True)

In [60]:
# Collect the index labels of reviews that consist only of whitespace.
# Iterating the 'review' column directly (instead of unpacking whole rows
# positionally) keeps this cell re-runnable after extra columns such as
# 'scores' have been added to df.
blanks = [
    idx
    for idx, text in df['review'].items()
    if isinstance(text, str) and text.isspace()
]
            

In [62]:
# Drop the rows whose index labels were collected in `blanks` (mutates df in place).
df.drop(blanks,inplace=True)

In [63]:
# Full text of the first review.
df.iloc[0]['review']

'Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'

In [64]:
# Polarity scores for that first review.
sid.polarity_scores(df.iloc[0]['review'])

{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'compound': 0.9454}

In [65]:
# Store the polarity-score dict of every review in a new 'scores' column.
df['scores'] = df['review'].apply(sid.polarity_scores)

In [66]:
# Preview the frame with the new 'scores' column.
df.head()

Unnamed: 0,label,review,scores
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co..."
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co..."
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com..."
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com..."
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp..."


In [70]:
# Pull the 'compound' entry out of each score dict into its own column.
df['compound'] = df['scores'].map(lambda polarity: polarity['compound'])

In [69]:
# Remove a misspelled 'compounnd' column if one is present (presumably left
# over from an earlier, since-edited cell -- confirm). errors='ignore' makes
# this a no-op instead of a KeyError when the column does not exist, which is
# the case when the notebook is run top to bottom on a fresh kernel.
df.drop('compounnd', axis=1, errors='ignore', inplace=True)

In [71]:
# Preview the frame with the 'compound' column.
df.head()

Unnamed: 0,label,review,scores,compound
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781


In [72]:
# Binarise the compound score: strictly positive -> 'pos', everything else
# (including exactly 0) -> 'neg'.
# NOTE(review): the movie-review section further down uses `>= 0`, so a
# compound of exactly 0 is classified differently there.
df['comp_score'] = df['compound'].map(lambda value: 'pos' if value > 0 else 'neg')

In [73]:
# Preview the frame with the derived 'comp_score' column.
df.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454,pos
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957,pos
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781,pos


In [74]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [75]:
# Accuracy of the VADER-derived comp_score against the dataset's label column.
accuracy_score(df['label'], df['comp_score'])

0.713

In [76]:
# Per-class precision / recall / f1 of comp_score against label.
print(classification_report(df['label'],df['comp_score']))

              precision    recall  f1-score   support

         neg       0.85      0.53      0.65      5097
         pos       0.65      0.90      0.75      4903

   micro avg       0.71      0.71      0.71     10000
   macro avg       0.75      0.72      0.70     10000
weighted avg       0.75      0.71      0.70     10000



In [77]:
# Confusion matrix of the dataset labels against the VADER-derived comp_score.
print(confusion_matrix(df['label'],df['comp_score']))

[[2716 2381]
 [ 489 4414]]


In [78]:
# Load the tab-separated movie reviews file. This rebinds `df`, so the Amazon
# reviews frame from the previous section is no longer reachable by that name.
df = pd.read_csv('UPDATED_NLP_COURSE/TextFiles/moviereviews.tsv', sep='\t')

In [79]:
# Preview the first rows of the movie reviews frame.
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [80]:
# Remove rows that contain missing values (mutates df in place).
df.dropna(inplace=True)

In [81]:
# Collect the index labels of reviews that consist only of whitespace.
# Iterating the 'review' column directly (instead of unpacking whole rows
# positionally) keeps this cell re-runnable after extra columns such as
# 'scores' have been added to df.
blanks = [
    idx
    for idx, text in df['review'].items()
    if isinstance(text, str) and text.isspace()
]

In [83]:
# Drop the rows whose index labels were collected in `blanks` (mutates df in place).
df.drop(blanks,inplace=True)

In [84]:
# Number of rows remaining after the clean-up above.
len(df)

1938

In [85]:
# Number of reviews per label value.
df['label'].value_counts()

pos    969
neg    969
Name: label, dtype: int64

In [86]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [87]:
# Create the analyzer again for this section (rebinds the `sid` defined earlier).
sid = SentimentIntensityAnalyzer()

In [88]:
# Store the polarity-score dict of every review in a new 'scores' column.
df['scores'] = df['review'].apply(sid.polarity_scores)

In [89]:
# Preview the frame with the new 'scores' column.
df.head()

Unnamed: 0,label,review,scores
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co..."
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com..."
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com..."
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co..."
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co..."


In [90]:
# Pull the 'compound' entry out of each score dict into its own column.
df['compound'] = df['scores'].map(lambda polarity: polarity['compound'])

In [91]:
# Preview the frame with the 'compound' column.
df.head()

Unnamed: 0,label,review,scores,compound
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484


In [92]:
# Binarise the compound score: zero or positive -> 'pos', negative -> 'neg'.
# NOTE(review): the Amazon-review section earlier uses a strict `> 0`, so a
# compound of exactly 0 is classified differently there.
df['comp_score'] = df['compound'].map(lambda value: 'pos' if value >= 0 else 'neg')

In [93]:
# Preview the frame with the derived 'comp_score' column.
df.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951,pos
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972,pos
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484,neg


In [94]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [95]:
# Accuracy of the VADER-derived comp_score against the dataset's label column.
print(accuracy_score(df['label'], df['comp_score']))

0.6357069143446853


In [96]:
# Confusion matrix of the dataset labels against the VADER-derived comp_score.
print(confusion_matrix(df['label'], df['comp_score']))

[[427 542]
 [164 805]]


In [97]:
# Per-class precision / recall / f1 of comp_score against label.
print(classification_report(df['label'], df['comp_score']))

              precision    recall  f1-score   support

         neg       0.72      0.44      0.55       969
         pos       0.60      0.83      0.70       969

   micro avg       0.64      0.64      0.64      1938
   macro avg       0.66      0.64      0.62      1938
weighted avg       0.66      0.64      0.62      1938

