# Named Entity Extraction using NLTK

requires nltk >3.0

[Data Source - Kaggle](https://www.kaggle.com/shivamb/netflix-shows)

In [2]:
import nltk
import pandas as pd

In [48]:
df = pd.read_csv("data/netflix_titles.csv.zip",compression='zip', sep= ',')
df.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."


In [30]:
movie_corpus = " ".join(df.loc[:,'description'])
print(len(movie_corpus))
movie_corpus[:1000]

1270878


"As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable. After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth. To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war. Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Orleans on this gritty reality series. In a city of coaching centers known to train India’s finest collegiate minds, an earnest but unexceptional student and his friends navigate campus life. The arrival of a charismatic young priest brings glorious miracles, ominous mysteries and renewed religious fervor to a dying town desperate to believe. Equestria's divided. But a bright-eyed hero believes Earth Ponies, Pegasi and Unicorns should be 

In [32]:
sentences = nltk.sent_tokenize(movie_corpus)
print(len(sentences))
sentences[:5]

9744


['As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.',
 'After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth.',
 'To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war.',
 'Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Orleans on this gritty reality series.',
 'In a city of coaching centers known to train India’s finest collegiate minds, an earnest but unexceptional student and his friends navigate campus life.']

In [33]:
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
print(len(tokenized_sentences))
tokenized_sentences[0]

9744


['As',
 'her',
 'father',
 'nears',
 'the',
 'end',
 'of',
 'his',
 'life',
 ',',
 'filmmaker',
 'Kirsten',
 'Johnson',
 'stages',
 'his',
 'death',
 'in',
 'inventive',
 'and',
 'comical',
 'ways',
 'to',
 'help',
 'them',
 'both',
 'face',
 'the',
 'inevitable',
 '.']

In [34]:
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
print(len(tagged_sentences))
tagged_sentences[0]

9744


[('As', 'IN'),
 ('her', 'PRP$'),
 ('father', 'NN'),
 ('nears', 'VBZ'),
 ('the', 'DT'),
 ('end', 'NN'),
 ('of', 'IN'),
 ('his', 'PRP$'),
 ('life', 'NN'),
 (',', ','),
 ('filmmaker', 'NN'),
 ('Kirsten', 'NNP'),
 ('Johnson', 'NNP'),
 ('stages', 'VBZ'),
 ('his', 'PRP$'),
 ('death', 'NN'),
 ('in', 'IN'),
 ('inventive', 'JJ'),
 ('and', 'CC'),
 ('comical', 'JJ'),
 ('ways', 'NNS'),
 ('to', 'TO'),
 ('help', 'VB'),
 ('them', 'PRP'),
 ('both', 'DT'),
 ('face', 'VBP'),
 ('the', 'DT'),
 ('inevitable', 'JJ'),
 ('.', '.')]

In [35]:
chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
# [i for i in chunked_sentences]

In [27]:
def extract_entity_names(t):
    entity_names = []
    
    if hasattr(t, 'label') and t.label:
        if t.label() == 'NE':
            entity_names.append(' '.join([child[0] for child in t]))
        else:
            for child in t:
                entity_names.extend(extract_entity_names(child))
                
    return entity_names

In [46]:
# entity_names = [extract_entity_names(tree) for tree in chunked_sentences]
entity_list = [" ".join(i) for i in entity_names if i]
list(set(entity_list))

['Omar Salma',
 'Jean Holloway',
 'Comedian Ryuji Akiyama Japan',
 'Tom Jerry',
 'Ryan Hamilton New York Disney World',
 'Taylor Swift',
 'Jake Kansas',
 'Sea',
 'Haruo Exif',
 'Celestina',
 'Rescue Riders',
 'Catholic France Léon Rivail',
 'Join Strawberry Shortcake Berry Bitty City',
 'José Miguel',
 'Tehran',
 'Army Mystic Force Power Rangers',
 'Carson Cory',
 'Louise Down',
 'Shrek',
 'CIA FBI',
 'Israeli Eli Cohen Syria Mossad',
 'Calvin',
 'Robinson',
 'Europe William',
 'Bean',
 'Freed',
 'Green Destiny',
 'Sonny Koufax',
 'Oregon',
 'Convinced',
 'U.S. Mississippi',
 'Steve Harvey Entertainer',
 'Eternia Teela Universe',
 'Lorelai Ivy',
 'Johannesburg',
 'Kitty Galore',
 'Young Mexico',
 'Power Rangers California',
 'Istanbul Mehmet',
 'Monty Python Eric Idle British Empire',
 'Teenage McKeyla',
 'Quincy Jones',
 'Eclectic Marco Luque',
 'Bogotá',
 'Red Skull Iron Man Captain America',
 'Air John Cutter Charles Rane Rane',
 'South African',
 'Filmmaker Errol Morris',
 'Tessa H