#### 1. Import brown corpus from NTLK corpus. Use the news category

In [1]:
from nltk.corpus import brown
import re

In [2]:
news_text = brown.words(categories='news')
news_text_str = " ".join(news_text)


#### 2. Remoe the newline character by using regex  


In [3]:
news_text_str = re.sub(r'\n', '', news_text_str)
news_text_str = news_text_str.strip()


#### 3. Use Spacy to find the named entities in the dataset

In [4]:
import spacy

nlp = spacy.load('en_core_web_sm')
news_text_nlp = nlp(news_text_str)

In [5]:
# print named entities in article
ner_tagged = [(word.text, word.ent_type_) for word in news_text_nlp]


In [6]:
named_entities = []
temp_entity_name = ''
temp_named_entity = None
for term, tag in ner_tagged:
    if tag:
        temp_entity_name = ' '.join([temp_entity_name, term]).strip()
        temp_named_entity = (temp_entity_name, tag)
    else:
        if temp_named_entity:
            named_entities.append(temp_named_entity)
            temp_entity_name = ''
            temp_named_entity = None
named_entities

[('The Fulton County Grand Jury', 'ORG'),
 ('Friday', 'DATE'),
 ('Atlanta', 'GPE'),
 ('the City Executive Committee', 'ORG'),
 ('the City of Atlanta', 'GPE'),
 ('September - October', 'DATE'),
 ('Fulton Superior Court', 'ORG'),
 ('Durwood Pye', 'PERSON'),
 ('Ivan Allen Jr.', 'PERSON'),
 ('Georgia', 'GPE'),
 ('Fulton', 'ORG'),
 ('Atlanta', 'GPE'),
 ('Fulton County', 'GPE'),
 ('two', 'CARDINAL'),
 ('The City Purchasing Department', 'ORG'),
 ('Georgia', 'GPE'),
 ('Legislature', 'ORG'),
 ("the State Welfare Department 's", 'ORG'),
 ('Fulton County', 'GPE'),
 ('the State Welfare Department', 'ORG'),
 ('Fulton County', 'GPE'),
 ('Fulton County', 'GPE'),
 ('Failure', 'PRODUCT'),
 ('Fulton', 'GPE'),
 ('Fulton', 'ORG'),
 ('two', 'CARDINAL'),
 ('the Atlanta Bar Association', 'ORG'),
 ('Atlanta', 'GPE'),
 ('multi - million - dollar', 'MONEY'),
 ('Jan. 1', 'DATE'),
 ('1', 'CARDINAL'),
 ('Four', 'CARDINAL'),
 ('Fulton County', 'GPE'),
 ('2', 'CARDINAL'),
 ('Fulton', 'ORG'),
 ('the Atlanta Police De

#### 4. Create a named-entity dictionary for the terms with the tag "ORG" and the count of occurrence for each term with the tag "ORG"

In [7]:
named_en_dict_org = {}

for item in named_entities:
    if item[1] == "ORG":
        if item[0] in named_en_dict_org:
            named_en_dict_org[item[0]] += 1
        else:
            named_en_dict_org[item[0]] = 1

named_en_dict_org

{'The Fulton County Grand Jury': 1,
 'the City Executive Committee': 1,
 'Fulton Superior Court': 1,
 'Fulton': 3,
 'The City Purchasing Department': 1,
 'Legislature': 18,
 "the State Welfare Department 's": 1,
 'the State Welfare Department': 1,
 'the Atlanta Bar Association': 1,
 'the Atlanta Police Department': 1,
 "Fulton Tax Commissioner 's Office": 1,
 'Grady Hospital': 3,
 'the Fulton Health Department': 1,
 'GOP': 5,
 'State Party': 1,
 'State Highway Department': 1,
 'Caldwell': 2,
 'The Georgia Legislature': 1,
 'Senate': 27,
 'the State Highway Department': 1,
 'Highway Department': 1,
 'Rural Roads Authority': 1,
 'the Rural Roads Authority': 1,
 'The Highway Department': 1,
 'House': 22,
 'Congress': 17,
 'the State Board of Education': 1,
 'the Miller County Democratic Executive Committee': 1,
 'the county school board': 1,
 "Price Daniel 's": 1,
 'the House Committee on Revenue and Taxation': 1,
 'Daniel': 1,
 'Tyler': 1,
 'the Texas Bankers Association': 1,
 'the Texas

#### 5.Sort the dictionary according the term occurrence count(10 pts) and show the top 10 occurring Named entities with the tag "ORG".(10 pts) Make sure you have the whole entity not just one word of it. 

In [10]:
named_en_dict_org_sort = sorted(named_en_dict_org.items(), key=lambda x: x[1], reverse=True)
for index_ in range(10):
    print(named_en_dict_org_sort[index_][0], named_en_dict_org_sort[index_][1])

Senate 27
House 22
Legislature 18
Congress 17
Hughes 15
Yankees 15
AP 14
U.N. 13
White House 12
Mantle 12


#### 6. Create a named-entity dictionary for the terms with the tag "PERSON" and the count of occurrence for each term with the tag "PERSON". (10 pts)

In [17]:
named_en_dict_person = {}

for item in named_entities:
    if item[1] == "PERSON":
        if item[0] in named_en_dict_person:
            named_en_dict_person[item[0]] += 1
        else:
            named_en_dict_person[item[0]] = 1

named_en_dict_person

{'Durwood Pye': 1,
 'Ivan Allen Jr.': 2,
 'William B. Hartsfield': 1,
 'William Berry Jr.': 1,
 'J. M. Cheshire': 1,
 '637 E. Pelham Rd': 1,
 'Henry L. Bowden': 2,
 'Hartsfield': 2,
 'Robert Snodgrass': 1,
 'James W. Dorsey': 1,
 'John Tower': 1,
 'Sam Caldwell': 1,
 'Garland': 2,
 'Rob Ledford': 1,
 'Vandiver': 1,
 'Marvin Griffin': 1,
 'B. D. Pelham': 1,
 'Pelham': 3,
 'Mac Barber': 1,
 'Barber': 3,
 'Harry Davis': 1,
 'Felix Bush': 1,
 'Davis': 8,
 'Bush': 2,
 'Carey Williams': 1,
 'Tom Williams': 1,
 'George P. Callan': 1,
 'Williams': 5,
 'Austin': 5,
 'Daniel termed': 1,
 'Daniel': 1,
 'Dewey Lawrence': 1,
 'Charles E. Hughes': 1,
 'Scott Hudson': 1,
 'Sherman': 4,
 'J. B. Brady': 1,
 'Harlingen': 1,
 'Howard Cox': 1,
 'George Parkhouse': 2,
 'Harris': 1,
 'Bexar': 1,
 'V. E.': 1,
 'Berry': 4,
 'Joe Ratcliff': 1,
 'Ratcliff': 2,
 'A. R. Schwartz': 1,
 'Louis Crump': 1,
 'A. M. Aikin Jr.': 1,
 'Marshall Formby': 1,
 'Formby': 1,
 'Wesley Roberts': 1,
 'Bill Hollowell': 1,
 'Robert

#### 7. Sort the dictionary according the term occurrence count(10 pts) and show the top 10 occurring Named entities with the tag "PERSON"(10 pts). Make sure you have the whole entity not just one word of it.

In [18]:
named_en_dict_person_sort = sorted(named_en_dict_person.items(), key=lambda x: x[1], reverse=True)
for var_ in range(10):
    print(named_en_dict_person_sort[var_][0], named_en_dict_person_sort[var_][1])

Kennedy 59
Maris 34
Mantle 27
Khrushchev 26
Stein 13
Mitchell 12
Kowalski 12
Player 12
Jr. 11
Robinson 11


#### 8. Use SpaCy's displacy module to highlight named entities in the Brown news text category.

In [20]:
from spacy import displacy

# visualize named entities
displacy.render(news_text_nlp, style='ent', jupyter=True)

#### 9. Use the keywords module from Gensim.summarization framework to find the keywords in the text file that you imported. Print out top 25 keywords according to their scores. (Parameters: scores and lemmatize parameter is True). Round the scores to 2 decimals.

In [4]:
from gensim.summarization import keywords

key_words = keywords(news_text_str, ratio=1.0, scores=True, lemmatize=True)
[(item, round(score, 2)) for item, score in key_words][:25]

[('mrs', 0.34),
 ('said', 0.25),
 ('stating', 0.14),
 ('years', 0.12),
 ('news ripened', 0.1),
 ('nationalized', 0.1),
 ('misses', 0.09),
 ('presidency', 0.09),
 ('johns', 0.09),
 ('schooling', 0.08),
 ('housed', 0.08),
 ('home', 0.08),
 ('counties', 0.07),
 ('highs', 0.07),
 ('dais', 0.07),
 ('cities', 0.07),
 ('includes', 0.07),
 ('americans', 0.07),
 ('marie', 0.07),
 ('timing', 0.07),
 ('manned', 0.06),
 ('williams', 0.06),
 ('publicity', 0.06),
 ('works', 0.06),
 ('generated', 0.06)]