# Keyword Extraction

In [1]:
import functools
import string
from collections import Counter
from string import punctuation

import nltk
import numpy as np
import pandas as pd
import spacy
import yake
from gensim.summarization import keywords
from keybert import KeyBERT
from rake_nltk import Rake

In [2]:
# Path of the chapter text file that the later parsing cells open.
file = "C:\\Users\\Viral Doshi\\Desktop\\Viral Academic\\IIT Bombay Intern\\Attempt 1\\Our_Past_1_Chapter2.txt"

# Sample input: a single paragraph (Para 2 of Ch 2). Every character listed in
# string.punctuation is deleted, so "hunter-gatherers" becomes "huntergatherers".
text = (
    'Archaeologists have found some of the things hunter-gatherers made and used. It is likely that people made and used tools of stone, wood and bone, of which stone tools have survived best. Some of these stone tools were used to cut meat and bone, scrape bark (from trees) and hides (animal skins), chop fruit and roots. Some may have been attached to handles of bone or wood, to make spears and arrows for hunting. Other tools were used to chop wood, which was used as firewood. Wood was also used to make huts and tools.'
).translate(dict.fromkeys(map(ord, string.punctuation)))

### Method 1: spaCy Keyword Extraction: Grammar-based Method
#### Extracting certain parts of speech

In [3]:
# Method 1: Spacy Keyword Extraction 

@functools.lru_cache(maxsize=None)
def _load_spacy_pipeline(model_name):
    """Load the named spaCy pipeline once; later calls reuse the same object."""
    return spacy.load(model_name)


def spacy_kw_ext(text):
    """Return up to ten keywords picked by part of speech and frequency.

    The text is lower-cased and run through the ``en_core_web_lg`` pipeline.
    Tokens that appear in the pipeline's default stop-word list are dropped,
    and only tokens tagged PROPN, ADJ or NOUN are kept.

    Args:
        text: The passage to extract keywords from.

    Returns:
        A list of at most ten token strings, most frequent first.
    """
    # Loading the model is expensive, so it is cached instead of being
    # reloaded on every call (this function is applied once per paragraph).
    nlp = _load_spacy_pipeline("en_core_web_lg")
    pos_tag = {'PROPN', 'ADJ', 'NOUN'}
    stop_words = nlp.Defaults.stop_words

    candidates = [
        token.text
        for token in nlp(text.lower())
        if token.text not in stop_words and token.pos_ in pos_tag
    ]
    return [word for word, _ in Counter(candidates).most_common(10)]

### Method 2: Yake Keyword Extraction: Statistical Method
##### Based on Statistical Features

In [4]:
# Method 2: Yake Keyword Extraction 

def yake_kw_ext(text):
    """Return up to ten single-word keywords extracted with YAKE.

    Args:
        text: The passage to extract keywords from.

    Returns:
        A list of keyword strings, in the reverse of the order in which the
        extractor returned them.
    """
    language = "en"
    max_ngram_size = 1
    deduplication_threshold = 0.9
    numOfKeywords = 10
    custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)
    scored_keywords = custom_kw_extractor.extract_keywords(text)
    # Each result is indexed as (keyword, score); only the keyword is kept.
    # NOTE(review): YAKE documents lower scores as more relevant, so this
    # reversal presumably puts the weakest of the top ten first -- confirm
    # that this ordering is intended before comparing ranks across methods.
    return [kw[0] for kw in reversed(scored_keywords)]

### Method 3: Rake NLTK Keyword Extraction
#### Frequency of Words

In [5]:
# Method 3: Rake NLTK Keyword Extraction

def rakenltk_kw_ext(text):
    """Return the first ten ranked phrases that RAKE finds in ``text``.

    The extractor is configured with ``min_length=1`` and ``max_length=1``.
    """
    extractor = Rake(min_length=1, max_length=1)
    extractor.extract_keywords_from_text(text)
    return extractor.get_ranked_phrases()[:10]

### Method 4: Gensim Keyword Extraction
#### Based on Textrank Algorithm

In [6]:
# Method 4: Gensim Keyword Extraction

def gensim_kw_ext(text):
    """Return the individual words of the keywords gensim extracts.

    ``keywords`` is asked for ten lemmatized keywords and its result is
    handled as a newline-separated string. Multi-word entries are broken
    into single words, so the list may hold more than ten items.

    Args:
        text: The passage to extract keywords from.

    Returns:
        A flat list of words; empty when no keywords were produced.
    """
    ranked = keywords(text, words = 10, lemmatize = True,)
    # Whitespace splitting covers both the newline and the space separators
    # and yields [] (not ['']) when the result string is empty.
    return ranked.split()

### Method 5: KeyBert + distilbert base nli mean tokens
#### Using a Pre-trained Model with max-sum for ranking

In [7]:
# Method 5: KeyBert + distilbert base nli mean tokens

@functools.lru_cache(maxsize=None)
def _distilbert_keybert_model():
    """Build the distilbert-backed KeyBERT extractor once and reuse it."""
    return KeyBERT('distilbert-base-nli-mean-tokens')


def KeyBert_distilbert(text):
    """Return up to ten single-word keywords from KeyBERT (distilbert model).

    Args:
        text: The passage to extract keywords from.

    Returns:
        A list of keyword strings in the order KeyBERT returned them.
    """
    # The model is cached rather than rebuilt per call; the earlier default
    # extract_keywords(text) call was dropped because its result was unused.
    kw_model = _distilbert_keybert_model()
    # NOTE(review): KeyBERT documents `diversity` as applying to use_mmr only;
    # with use_maxsum=True it presumably has no effect -- confirm.
    ranked = kw_model.extract_keywords(text, keyphrase_ngram_range=(1,1), stop_words='english', use_maxsum=True, top_n=10, diversity=0.2, nr_candidates = 20)
    return [pair[0] for pair in ranked]

### Method 6: KeyBert + paraphrase
#### Using another pre-trained Model + Max-sum + higher diversity measure

In [8]:
# Method 6: KeyBert + paraphrase

@functools.lru_cache(maxsize=None)
def _paraphrase_keybert_model():
    """Build the paraphrase-backed KeyBERT extractor once and reuse it."""
    return KeyBERT(model='xlm-r-distilroberta-base-paraphrase-v1')


def KeyBert_paraphrase(text):
    """Return up to ten single-word keywords from KeyBERT (paraphrase model).

    Args:
        text: The passage to extract keywords from.

    Returns:
        A list of keyword strings in the order KeyBERT returned them.
    """
    # The model is cached rather than rebuilt per call; the earlier default
    # extract_keywords(text) call was dropped because its result was unused.
    kw_model = _paraphrase_keybert_model()
    # NOTE(review): KeyBERT documents `diversity` as applying to use_mmr only;
    # with use_maxsum=True it presumably has no effect -- confirm.
    ranked = kw_model.extract_keywords(text, keyphrase_ngram_range=(1,1), stop_words='english', use_maxsum=True, diversity=0.7, top_n = 10)
    return [pair[0] for pair in ranked]

In [9]:
# Run every extractor on the sample paragraph.
a = spacy_kw_ext(text)
b = yake_kw_ext(text)
c = rakenltk_kw_ext(text)
d = gensim_kw_ext(text)
e = KeyBert_distilbert(text)
f = KeyBert_paraphrase(text)

# Row i holds the i-th keyword of each method, in the column order used by the
# comparison table. An extractor may return fewer than ten keywords, so missing
# positions are filled with '' instead of raising IndexError.
data = [
    [ranked[i] if i < len(ranked) else '' for ranked in (a, b, c, d, e, f)]
    for i in range(10)
]

### Visualizing all 6 Methods and checking their results on Para 2

In [10]:
# One column per extraction method, rows numbered from 1, cells left-aligned.
method_columns = ["Spacy", "Yake", "RakeNLTK", "Gensim", "Keybert-distilbert", "Keybert-paraphrase"]
df = pd.DataFrame(data, columns=method_columns, index=np.arange(1, len(data) + 1))
df = df.style.set_properties(**{'text-align': 'left'})

display(df)

Unnamed: 0,Spacy,Yake,RakeNLTK,Gensim,Keybert-distilbert,Keybert-paraphrase
1,tools,things,wood,scrape,fruit,bark
2,wood,found,used,stone,trees,used
3,stone,Archaeologists,trees,wood,stone,skins
4,bone,chop,tools,animal,arrows,trees
5,archaeologists,make,roots,skins,bone,arrows
6,things,made,may,chop,meat,cut
7,huntergatherers,bone,likely,things,hunting,firewood
8,likely,stone,hunting,huntergatherers,firewood,archaeologists
9,people,wood,handles,cut,archaeologists,hunting
10,meat,tools,found,meat,huntergatherers,huntergatherers


### --------- X ---------

### All six methods work reasonably well! Let's apply them to the full chapter

In [11]:
# Parse the chapter file into one [chapter name, paragraph title, description]
# record per paragraph. The parser is line-driven:
#   * a line containing 'CHAPTER' supplies the chapter name (text after ':'),
#   * a blank line opens a new, still empty, record,
#   * the first non-blank line of an empty record is the paragraph title,
#   * the following non-blank line(s) are the description,
#   * two blank lines without a description in between end the parsing.
para_data = [[]]
para_name = ''
chapter_name = ''
EndOfChap = 0  # blank lines seen since the last description line
with open(file, encoding="utf8") as f:
    for line in f:
        if EndOfChap >= 2:
            # Nothing after the end-of-chapter marker is used, so stop reading.
            break

        if 'CHAPTER' in line:
            chapter_name = line.split(':')[1].strip()
            continue

        if line == '\n':
            para_data.append([])
            EndOfChap += 1

        elif para_data[-1] == [] and para_name == '':
            para_name = line

        elif para_data[-1]:
            # Further description line of the current paragraph: extend the
            # description instead of replacing the record with an empty title.
            para_data[-1][2] = para_data[-1][2].rstrip() + ' ' + line.strip()
            EndOfChap = 0

        else:
            para_data[-1] = [chapter_name, para_name, line]
            para_name = ''
            EndOfChap = 0

# Drop records that never received a description and trim surrounding
# whitespace (including the trailing newline) from every field.
para_data = [[field.strip() for field in record] for record in para_data if record]

In [12]:
# Parse the chapter file into one [chapter name, paragraph title, sentence]
# record per sentence. Line handling:
#   * a line containing 'CHAPTER' supplies the chapter name (text after ':'),
#   * a blank line ends the current paragraph,
#   * the first non-blank line after a blank line is the paragraph title,
#   * every other line is split into sentences with spaCy,
#   * two blank lines without sentence text in between end the parsing.
sent_data = []
para_name = ''
chapter_name = ''
EndOfChap = 0  # blank lines seen since the last line of sentence text
newpara = False  # True between a blank line and the title that follows it

nlp = spacy.load("en_core_web_sm")
with open(file, encoding="utf8") as f:
    for line in f:
        if EndOfChap >= 2:
            # Nothing after the end-of-chapter marker is used, so stop reading.
            break

        if 'CHAPTER' in line:
            chapter_name = line.split(':')[1].strip()
            continue

        if line == '\n':
            para_name = ''
            newpara = True
            EndOfChap += 1
            continue

        if newpara and para_name == '':
            para_name = line.strip()
            newpara = False
            continue

        if not newpara:
            sent_data.extend(
                [chapter_name, para_name, str(sent).strip()]
                for sent in nlp(line).sents
            )
            EndOfChap = 0


In [13]:
# Sentence-level and paragraph-level tables, both with rows numbered from 1.
df_sent = pd.DataFrame(
    sent_data,
    columns=['Chapter Name', 'Paragraph Title', 'Sentence'],
    index=np.arange(1, len(sent_data) + 1),
)
df_para = pd.DataFrame(
    para_data,
    columns=['Chapter Name', 'Paragraph Title', 'Description'],
    index=np.arange(1, len(para_data) + 1),
)

In [14]:
# Delete every character listed in string.punctuation from each description.
punctuation_table = str.maketrans('', '', string.punctuation)
df_para['Description'] = df_para['Description'].apply(lambda description: description.translate(punctuation_table))

In [15]:
# Add one keyword-list column per extraction method, in this order.
keyword_extractors = {
    'Spacy': spacy_kw_ext,
    'Yake': yake_kw_ext,
    'RakeNLTK': rakenltk_kw_ext,
    'Gensim': gensim_kw_ext,
    'Keybert-distilbert': KeyBert_distilbert,
    'Keybert-paraphrase': KeyBert_paraphrase,
}
for column_name, extractor in keyword_extractors.items():
    df_para[column_name] = df_para['Description'].apply(extractor)

## Let's look at the final DataFrame, with one row per paragraph and the keywords extracted from each paragraph

In [16]:
# Bare expression as the last line of the cell, so the notebook renders the table.
df_para

Unnamed: 0,Chapter Name,Paragraph Title,Description,Spacy,Yake,RakeNLTK,Gensim,Keybert-distilbert,Keybert-paraphrase
1,FROM HUNTING-GATHERING TO GROWING FOOD,Tushar’s train journey,Tushar was going from Delhi to Chennai for his...,"[tushar, people, delhi, chennai, cousin, weddi...","[squeeze, managed, travelling, wedding, cousin...","[’, wedding, tushar, travelling, trains, train...","[people, fly, later, tushar, glass, pane, uncl...","[quickly, 150, trees, chennai, buses, trains, ...","[150, quickly, fly, buses, wondered, cousin, u..."
2,FROM HUNTING-GATHERING TO GROWING FOOD,The earliest people: why were they on the move ?,We know about people who lived in the subconti...,"[place, search, people, water, animals, plants...","[hunted, food, moved, huntergatherers, animals...","[would, way, water, summer, subcontinent, stay...","[animal, seasonal, people, wild, fruit, hunter...","[fruits, summer, rivers, bear, seasonal, eaten...","[eggs, trees, reasons, moved, stayed, movement..."
3,FROM HUNTING-GATHERING TO GROWING FOOD,How do we know about this place ?,Archaeologists have found some of the things h...,"[tools, wood, stone, bone, archaeologists, thi...","[things, found, Archaeologists, chop, make, ma...","[wood, used, trees, tools, roots, may, likely,...","[scrape, stone, wood, animal, skins, chop, cut...","[fruit, trees, stone, arrows, bone, meat, hunt...","[bark, used, skins, trees, arrows, cut, firewo..."
4,FROM HUNTING-GATHERING TO GROWING FOOD,Choose a place to live in,Look at Map 2 below All the places marked with...,"[places, map, sites, stone, red, triangles, ar...","[found, archaeologists, triangles, red, marked...","[water, sites, shown, rivers, places, many, lo...","[near, people, quality, stone, hunter, gathere...","[easily, triangles, important, red, rivers, la...","[important, shown, red, gatherers, lakes, ston..."
5,FROM HUNTING-GATHERING TO GROWING FOOD,Rock paintings and what they tell us,Many of the caves in which these early people ...,"[paintings, pradesh, caves, early, people, wal...","[walls, lived, people, early, caves, skill, pa...","[walls, skill, paintings, many, caves]","[animals, pradesh, uttar, best, examples, grea...","[early, pradesh, uttar, best, southern, accura...","[lived, best, southern, early, drawn, accuracy..."
6,FROM HUNTING-GATHERING TO GROWING FOOD,Sites,Sites are places where the remains of things t...,"[sites, earth, places, remains, things, tools,...","[buildings, pots, tools, things, remains, plac...","[water, surface, sites, remains, places, peopl...","[chapters, sites, different, things, tools, po...","[chapters, learn, water, earth, different, bui...","[people, learn, left, different, used, pots, c..."
7,FROM HUNTING-GATHERING TO GROWING FOOD,Finding out about fire,Find the Kurnool caves on Map 2 page13 Traces ...,"[fire, kurnool, caves, map, page13, traces, as...","[suggests, found, ash, caves, today, Find, Kur...","[used, use, today, suggests, source, people, l...","[page, kurnool, caves, roast, meat, scare, awa...","[source, today, away, light, scare, page13, an...","[away, today, map, suggests, animals, page13, ..."
8,FROM HUNTING-GATHERING TO GROWING FOOD,Names and dates,Archaeologists have given lengthy names for th...,"[years, names, time, period, stone, tools, pal...","[places, called, Neolithic, period, tools, tim...","[wood, use, time, think, studying, sickles, sa...","[means, stone, period, middle, long, changes, ...","[studying, greek, lengthy, tools, bone, 10000,...","[changes, finds, microliths, lengthy, 12000, p..."
9,FROM HUNTING-GATHERING TO GROWING FOOD,A changing environment,Around 12000 years ago there were major change...,"[animals, years, major, changes, climate, worl...","[shift, world, climate, major, ago, important,...","[world, survived, shift, rearing, number, like...","[warm, antelope, goat, food, habits, start, th...","[years, seasons, warm, habits, 12000, goat, an...","[followed, antelope, turn, shift, 12000, seaso..."
10,FROM HUNTING-GATHERING TO GROWING FOOD,The beginning of farming and herding,This was also a time when several grain bearin...,"[animals, people, food, animal, wild, dog, tim...","[animal, Men, lived, wild, food, grew, dog, ta...","[time, think, tamed, shelters, ripened, reason...","[people, grass, animal, grew, probably, goat, ...","[shelters, women, dog, pig, goat, rice, barley...","[wild, wheat, bearing, camps, plants, ancestor..."


In [17]:
# Write the paragraph table, keyword columns included, to a CSV file.
df_para.to_csv(path_or_buf='Chapter2_keywords.csv')

###### -- Viral Doshi