In [15]:
# Load the 'en_core_web_sm' spaCy pipeline and run it over one sample sentence.
# `spacy` itself must already have been imported by an earlier cell.
nlp = spacy.load('en_core_web_sm')
doc = nlp('At midnight the doorbell rang, startling him fearfully.')

In [16]:
# Print each token's surface text, lemma and coarse part-of-speech tag.
for token in doc: 
    print(token.text, token.lemma_, token.pos_)

At at ADP
midnight midnight NOUN
the the DET
doorbell doorbell NOUN
rang rang NOUN
, , PUNCT
startling startle VERB
him he PRON
fearfully fearfully ADV
. . PUNCT


In [17]:
# Render `doc` with displaCy using the dependency-parse style ('dep').
spacy.displacy.render(doc, style='dep')

In [30]:
# Sample passage that the following cells feed to the pipeline
# (named-entity listing and per-part-of-speech word counts).
text = """Maria Sharapova has basically no friends as tennis players on the WTA Tour. The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.
I think everyone knows this is my job here. When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.
So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match. I'm a pretty competitive girl. I say my hellos, but I'm not sending any players flowers as well.
Uhm, I'm not really friendly or close to many players. I have not a lot of friends away from the courts.' When she said she is not really close to a lot of players, is that something strategic that she is doing? Is it different on the men's tour than the women's tour? 'No, not at all.
I think just because you're in the same sport doesn't mean that you have to be friends with everyone just because you're categorized, you're a tennis player, so you're going to get along with tennis players.
I think every person has different interests. I have friends that have completely different jobs and interests, and I've met them in very different parts of my life. I think everyone just thinks because we're tennis players we should be the greatest of friends.
But ultimately tennis is just a very small part of what we do. There are so many other things that we're interested in, that we do."""

In [31]:
# Re-run the pipeline on the longer passage (this rebinds `doc`) and
# print every recognised named entity together with its label.
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

Maria Sharapova PERSON
the WTA Tour ORG
Russian NORP
the next few minutes TIME


In [32]:
import spacy

# Module-level pipeline that create_word_counts_by_pos reads as a global.
# NOTE(review): this loads the same model name as `nlp` in an earlier cell —
# confirm a second copy is really needed.
nlp_sm = spacy.load("en_core_web_sm")

def create_word_counts_by_pos(raw_text, list_of_pos, word_count_dict_input = None):
    """
    Count the non-stop-word lemmas of raw_text, grouped by part of speech.

    raw_text: the text to analyse, as a string; it is run through the
        module-level nlp_sm spaCy pipeline
    list_of_pos: part-of-speech tags (compared against token.pos_) to count;
        tokens carrying any other tag are ignored
    word_count_dict_input: optional dictionary returned by a previous call;
        when given it is updated in place and returned, so counts
        accumulate across several texts

    returns a dictionary, keys are the pos's in list_of_pos (plus any keys
    already present in word_count_dict_input),
    values are dictionaries mapping lemma -> count
    """

    doc = nlp_sm(raw_text)

    if word_count_dict_input is None:
        word_count_dict = {}
    else:
        word_count_dict = word_count_dict_input

    # Guarantee a bucket for every requested part of speech. This also covers
    # a caller-supplied dictionary that was built with a different
    # list_of_pos, which previously raised KeyError on the first such token.
    for part_of_speech in list_of_pos:
        word_count_dict.setdefault(part_of_speech, {})

    for token in doc:
        # Skip stop words and parts of speech we were not asked to count.
        if token.is_stop or token.pos_ not in list_of_pos:
            continue
        counts_for_pos = word_count_dict[token.pos_]
        counts_for_pos[token.lemma_] = counts_for_pos.get(token.lemma_, 0) + 1

    return word_count_dict

def filter_word_count_dict_to_frequent(word_count_dict, threshold):
    """
    Build a new dictionary with the same part-of-speech keys as
    word_count_dict, keeping for each one only the words whose count
    is strictly greater than threshold.
    A part of speech with no surviving word maps to an empty dictionary.
    """
    return {
        pos: {
            word: count
            for word, count in counts.items()
            if count > threshold
        }
        for pos, counts in word_count_dict.items()
    }

def collect_most_frequent_words(word_count_dict, number_to_collect):
    """
    word_count_dict: keys are parts of speech, values are dictionaries
        mapping word -> count
    number_to_collect: how many entries to keep for each part of speech

    returns a dictionary with the same keys; each value is a list of
    (word, count) tuples ordered by descending count and cut to
    number_to_collect entries. On a tie, the word that was inserted first
    comes first.
    """
    top_words_by_pos = {}

    for pos, counts in word_count_dict.items():
        ranked = list(counts.items())
        # The sort is stable, so equal counts keep their insertion order.
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        top_words_by_pos[pos] = ranked[:number_to_collect]

    return top_words_by_pos
        

In [33]:
# Part-of-speech tags whose lemmas will be counted in the next cell.
list_of_pos = ['NOUN', 'PROPN', 'ADJ', 'VERB']

In [34]:
# Count lemmas of the sample passage, one sub-dictionary per tag in list_of_pos.
word_count_dict = create_word_counts_by_pos(text, list_of_pos)

In [35]:
# Display the per-part-of-speech lemma counts.
word_count_dict

{'NOUN': {'friend': 5,
  'tennis': 6,
  'player': 8,
  'problem': 1,
  'interview': 1,
  'feeling': 1,
  'job': 2,
  'court': 3,
  'playing': 1,
  'competitor': 1,
  'person': 2,
  'locker': 1,
  'room': 1,
  'net': 1,
  'conversation': 1,
  'weather': 1,
  'minute': 1,
  'match': 1,
  'girl': 1,
  'hello': 1,
  'flower': 1,
  'lot': 2,
  'man': 1,
  'tour': 2,
  'woman': 1,
  'sport': 1,
  'interest': 2,
  'part': 1,
  'life': 1,
  'thing': 1},
 'PROPN': {'Maria': 1, 'Sharapova': 1, 'WTA': 1, 'Tour': 1, 'Uhm': 1},
 'ADJ': {'russian': 1,
  'recent': 1,
  'single': 1,
  'competitive': 1,
  'friendly': 1,
  'close': 2,
  'strategic': 1,
  'different': 4,
  'great': 1,
  'small': 1,
  'interested': 1},
 'VERB': {'speak': 1,
  'say': 2,
  'hide': 1,
  'think': 5,
  'know': 2,
  'want': 1,
  'beat': 1,
  'strike': 1,
  'try': 1,
  'win': 1,
  'send': 1,
  'mean': 1,
  'categorize': 1,
  'go': 1,
  'meet': 1}}

In [25]:
# Keep only lemmas counted strictly more than 10 times. No lemma of the short
# sample passage reaches that, so every part of speech comes back empty.
frequent_word_count_dict = filter_word_count_dict_to_frequent(word_count_dict, 10)

In [26]:
# Display the filtered counts.
frequent_word_count_dict

{'NOUN': {}, 'PROPN': {}, 'ADJ': {}, 'VERB': {}}

In [27]:
number_of_words_to_collect = 5
list_of_pos = ['NOUN', 'ADJ', 'VERB']

# words[pos][rank] and word_counts[pos][rank] each start as an empty list,
# for rank in 0 .. number_of_words_to_collect - 1; a later cell appends to them.
words = {
    part_of_speech: {number: [] for number in range(number_of_words_to_collect)}
    for part_of_speech in list_of_pos
}
word_counts = {
    part_of_speech: {number: [] for number in range(number_of_words_to_collect)}
    for part_of_speech in list_of_pos
}

In [29]:
# For every input file, record its most frequent lemmas per part of speech:
# words[pos][rank] gains the rank-th most frequent lemma and
# word_counts[pos][rank] gains its count, one entry per file.
# NOTE: `filenames` and `txt_adjusted_folder` are not defined in this cell;
# they must already exist in the kernel before it is run.
for filename in filenames:
    # The context manager closes the file even if reading fails.
    with open(txt_adjusted_folder + filename + '.txt') as my_file:
        print("Currently processing: " + filename)
        raw_text = my_file.read()
    word_count_dict = create_word_counts_by_pos(
            raw_text, list_of_pos)
    most_frequent_list_dict = collect_most_frequent_words(
            word_count_dict, number_of_words_to_collect)
    for part_of_speech in list_of_pos:
        ranked = most_frequent_list_dict[part_of_speech]
        for number in range(number_of_words_to_collect):
            # A short text can hold fewer distinct lemmas than requested
            # ranks; pad with (None, 0) so every list keeps exactly one
            # entry per file instead of raising IndexError.
            word, count = ranked[number] if number < len(ranked) else (None, 0)
            words[part_of_speech][number].append(word)
            word_counts[part_of_speech][number].append(count)

NameError: name 'filenames' is not defined

In [2]:
# Indonesian example sentences, one list element per sentence.
sentences = [
    "Indonesia merupakan negara kepulauan yang kaya akan budaya.",
    "Berapa banyak warga yang dibutuhkan saat kerja bakti?",
    "Penyaluran pupuk berasal dari lima lokasi yakni Bontang, Kalimantan Timur, Surabaya, Banyuwangi, Semarang, dan Makassar.",
    "PT Pupuk Kaltim telah menyalurkan 274.707 ton pupuk bersubsidi ke wilayah penyaluran di 14 provinsi.",
    # The trailing comma matters: without it this string and the next one
    # are implicitly concatenated into a single element.
    "Jakarta adalah kota besar yang nyaris tidak pernah tidur.",
    "Kamu ada di mana semalam?",
    "Siapa yang membeli makanan ringan tersebut?",
    "Siapa presiden pertama Republik Indonesia?",
]

In [5]:
# NOTE(review): this import rebinds `sentences`, replacing the list defined
# by hand in the previous cell.
from spacy.lang.id.examples import sentences
import spacy
# NOTE(review): the examples come from spacy.lang.id (Indonesian) but the
# English model 'en_core_web_sm' is loaded here — confirm this is intended.
nlp = spacy.load('en_core_web_sm')
# nlp.pipe hands back a generator (see the output of the next cell); the
# documents are only produced when it is iterated.
docs = nlp.pipe(sentences)

In [6]:
# Shows the generator object itself; iterate it (e.g. list(docs)) to obtain
# the parsed documents.
docs

<generator object Language.pipe at 0x0000019E92FCC4F8>