In [1]:
import spacy
from collections import Counter
from spacy import displacy

In [2]:
# Load the 'en_core_web_sm' spaCy pipeline once; later cells reuse it.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Read the lyrics file. The context manager closes the file handle as soon as
# the content is read, and the explicit encoding keeps the result independent
# of the platform's default encoding.
with open("Music was my first love - John Miles.txt", encoding="utf-8") as lyrics_file:
    # Flatten to one line: every newline becomes a space, then trim both ends.
    text = lyrics_file.read().replace("\n", " ").strip()
text

'Music was my first love And it will be my last. Music of the future And music of the past.  To live without my music Would be impossible to do. In this world of troubles, My music pulls me through.  Music was my first love And it will be last. Music of the future And music of the past And music of the past And music of the past.  Music was my first love And it will be my last. Music of the future And music of the past.  To live without my music Would be impossible to do. In this world of troubles, My music pulls me through.'

In [4]:
def normalize_ws(text):
    """Return a normalized, space-separated string built from ``text``.

    ``text`` is iterated token by token; each token must expose ``is_punct``,
    ``is_space`` and ``lemma_`` (e.g. the tokens of a spaCy document).
    Punctuation and whitespace tokens are dropped, every remaining token is
    replaced by its lower-cased lemma, and the lemmas are joined with single
    spaces. No stop-word filtering is applied.
    """
    kept_lemmas = (
        tok.lemma_.lower()
        for tok in text
        if not (tok.is_punct or tok.is_space)
    )
    return ' '.join(kept_lemmas)

In [5]:
# Run the loaded spaCy pipeline over the flattened lyrics text.
doc = nlp(text)

In [6]:
# Sanity check: print the processed document.
print(doc)

Music was my first love And it will be my last. Music of the future And music of the past.  To live without my music Would be impossible to do. In this world of troubles, My music pulls me through.  Music was my first love And it will be last. Music of the future And music of the past And music of the past And music of the past.  Music was my first love And it will be my last. Music of the future And music of the past.  To live without my music Would be impossible to do. In this world of troubles, My music pulls me through.


In [7]:
# One line per token: its index in the document, its text, and its dependency label.
for token in doc:
    print(f"{token.i} {token.text} {token.dep_}")

0 Music nsubj
1 was ROOT
2 my poss
3 first amod
4 love attr
5 And cc
6 it nsubj
7 will aux
8 be conj
9 my poss
10 last attr
11 . punct
12 Music ROOT
13 of prep
14 the det
15 future pobj
16 And cc
17 music conj
18 of prep
19 the det
20 past pobj
21 . punct
22   dep
23 To aux
24 live csubj
25 without prep
26 my poss
27 music pobj
28 Would aux
29 be ROOT
30 impossible acomp
31 to aux
32 do xcomp
33 . punct
34 In prep
35 this det
36 world pobj
37 of prep
38 troubles pobj
39 , punct
40 My poss
41 music nsubj
42 pulls ROOT
43 me dobj
44 through prt
45 . punct
46   dep
47 Music nsubj
48 was ROOT
49 my poss
50 first amod
51 love attr
52 And cc
53 it nsubj
54 will aux
55 be ROOT
56 last acomp
57 . punct
58 Music ROOT
59 of prep
60 the det
61 future pobj
62 And cc
63 music conj
64 of prep
65 the det
66 past pobj
67 And cc
68 music conj
69 of prep
70 the det
71 past pobj
72 And cc
73 music conj
74 of prep
75 the det
76 past pobj
77 . punct
78   dep
79 Music nsubj
80 was ROOT
81 my poss
82 first a

In [8]:
# Render displaCy's dependency ("dep" style) visualization of the parsed lyrics.
displacy.render(doc, style="dep")

In [9]:
# Normalize the parsed lyrics (stop words are kept) and feed the normalized
# string through the spaCy pipeline again to obtain a new document.
normalized_ws_text = normalize_ws(doc)
normalized_ws_doc = nlp(normalized_ws_text)

# Collect the token texts by coarse part-of-speech tag, stop words included:
# first the nouns, then the verbs.
nouns_ws, verbs_ws = (
    [token.text for token in normalized_ws_doc if token.pos_ == pos_tag]
    for pos_tag in ('NOUN', 'VERB')
)

In [10]:
# Print every noun found in the normalized text (stop words were not removed).
print(nouns_ws)

['music', 'love', 'music', 'future', 'music', 'past', 'music', 'world', 'trouble', 'music', 'music', 'love', 'music', 'future', 'music', 'past', 'music', 'past', 'music', 'music', 'love', 'music', 'future', 'music', 'past', 'music', 'world', 'trouble', 'music']


In [11]:
# Count how often each noun occurs (stop words were not removed beforehand).
word_freq_ws = Counter(nouns_ws)
# Keep the (noun, count) pairs of the ten most frequent nouns, most frequent first.
common_nouns_ws = word_freq_ws.most_common(10)

In [12]:
# Print the ten most common nouns (fewer entries if there are fewer distinct nouns).
print(common_nouns_ws)  # with stopwords

[('music', 15), ('past', 4), ('love', 3), ('future', 3), ('world', 2), ('trouble', 2)]


In [13]:
# Print the ten most common verbs (fewer entries if there are fewer distinct verbs).
print(Counter(verbs_ws).most_common(10))  # with stopwords

[('live', 2), ('do', 2), ('pull', 2)]
