<a href="https://colab.research.google.com/github/Nima-Nilchian/Keyword_extraction/blob/master/Keyword_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Keyword Extraction

# استخراج عبارت‌های کلیدی متن

<p align="right">
ارزیابی روش‌ها مورد استفاده قرار خواهد گرفت. علاوه بر گزارش نتایج برای هر یک از روش‌ها با استفاده از مجموعه داده، نمونه‌هایی از خروجی را هم برای هر روش گزارش کرده و به صورت شهودی نیز مقایسه‌ای انجام دهید. همچنین سرعت روش‌های پیاده‌سازی شده نیز نیاز به مقایسه دارد.
</p>
<p align="right">
:روش‌های زیر را پیاده‌سازی و طبق روال بالا ارزیابی و مقایسه کنید
</p>

*   baseline (TF-IDF)
*   TF-IDF with Ngrams
*   TF-IDF with chunking
*   KP-Miner
*   Yake
*   TextRank
*   SingleRank
*   TopicRank
*   TopicalPageRank
*   PositionRank
*   MultipartiteRank
*   sCAKE
*   SGRank











# Reading and Preprocessing Data

In [None]:
!pip install git+https://github.com/boudinfl/pke.git
!pip install perke
!python -m perke download

In [3]:
# import numpy as np
import pandas as pd
import json
import string
import perke

In [None]:
# Mount Google Drive (Colab-only module) so the dataset file can be read
# from under ./content in the loading cell.
# NOTE(review): the mount point is a relative path, so it resolves against the
# notebook's current working directory — confirm it matches the dataset path.
from google.colab import drive
drive.mount("./content")

In [None]:
# Dataset on the mounted Drive: one JSON object per line, each carrying the
# `id`, `body` and `keywords` fields used in this notebook.
data_loc = './content/MyDrive/datasets/ke/ke_dataset.txt'

# Parse all records first and build the DataFrame once: pd.concat inside the
# loop re-copies the whole frame for every line. The encoding is explicit so
# non-ASCII text decodes the same way regardless of the platform default, and
# blank lines (e.g. a trailing newline) are skipped instead of crashing
# json.loads.
with open(data_loc, 'r', encoding='utf-8') as f:
  records = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(records)

# The record id is not needed for extraction or evaluation.
df.drop('id', axis=1, inplace=True)
all_texts = df['body'].tolist()

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450 entries, 0 to 449
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   body      450 non-null    object
 1   keywords  450 non-null    object
dtypes: object(2)
memory usage: 7.2+ KB


In [None]:
def ke_textRank(text, n=10):
  """Extract up to `n` keyphrases from `text` with perke's TextRank.

  Args:
    text: Input document handed to the extractor's `load_text`.
    n: Maximum number of keyphrases requested from `get_n_best`.

  Returns:
    List of keyphrases in the order `get_n_best` returns them; an empty
    list when the extractor yields no keyphrases.
  """
  extractor = perke.unsupervised.graph_based.TextRank()

  extractor.load_text(text, word_normalization_method='lemmatization')
  # window_size / top_t_percent are forwarded to perke unchanged; see its
  # TextRank documentation for their exact meaning.
  extractor.weight_candidates(window_size=2, top_t_percent=0.33)

  keyphrases = extractor.get_n_best(n=n)
  # Each entry is a (candidate, score) pair. Unpacking pairwise, rather than
  # `zip(*keyphrases)`, avoids a ValueError when nothing was extracted.
  return [candidate for candidate, _score in keyphrases]

def ke_topicRank(text, n=10):
  """Extract up to `n` keyphrases from `text` with perke's TopicRank.

  Args:
    text: Input document handed to the extractor's `load_text`.
    n: Maximum number of keyphrases requested from `get_n_best`.

  Returns:
    List of keyphrases in the order `get_n_best` returns them; an empty
    list when the extractor yields no keyphrases.
  """
  # Only these part-of-speech tags are declared valid for the extractor.
  valid_pos_tags = {'NOUN', 'ADJ'}
  extractor = perke.unsupervised.graph_based.TopicRank(valid_pos_tags=valid_pos_tags)

  extractor.load_text(input=text, word_normalization_method='lemmatization')
  extractor.select_candidates()
  # threshold / metric / linkage_method are forwarded to perke unchanged;
  # see its TopicRank documentation for their exact meaning.
  extractor.weight_candidates(
      threshold=0.74, metric='jaccard', linkage_method='average')

  keyphrases = extractor.get_n_best(n=n)
  # Each entry is a (candidate, score) pair. Unpacking pairwise, rather than
  # `zip(*keyphrases)`, avoids a ValueError when nothing was extracted.
  return [candidate for candidate, _score in keyphrases]

def ke_singleRank(text, n=10):
  """Extract up to `n` keyphrases from `text` with perke's SingleRank.

  Args:
    text: Input document handed to the extractor's `load_text`.
    n: Maximum number of keyphrases requested from `get_n_best`.

  Returns:
    List of keyphrases in the order `get_n_best` returns them; an empty
    list when the extractor yields no keyphrases.
  """
  # Only these part-of-speech tags are declared valid for the extractor.
  valid_pos_tags = {'NOUN', 'ADJ'}
  extractor = perke.unsupervised.graph_based.SingleRank(valid_pos_tags=valid_pos_tags)

  extractor.load_text(input=text, word_normalization_method='lemmatization')
  extractor.select_candidates()
  # The keyword is `window_size`, as in the TextRank and PositionRank calls
  # in this file; `window` is the pke spelling.
  # NOTE(review): confirm against the installed perke version.
  extractor.weight_candidates(window_size=10)

  keyphrases = extractor.get_n_best(n=n)
  # Each entry is a (candidate, score) pair. Unpacking pairwise, rather than
  # `zip(*keyphrases)`, avoids a ValueError when nothing was extracted.
  return [candidate for candidate, _score in keyphrases]


def ke_positionRank(text, n=10):
  """Extract up to `n` keyphrases from `text` with perke's PositionRank.

  Args:
    text: Input document handed to the extractor's `load_text`.
    n: Maximum number of keyphrases requested from `get_n_best`
      (defaults to 10, like the other `ke_*` helpers).

  Returns:
    List of keyphrases in the order `get_n_best` returns them; an empty
    list when the extractor yields no keyphrases.
  """
  # Define the grammar for selecting the keyphrase candidates
  grammar = r"""
      NP:
          {<NOUN>}<VERB>
      NP:
          {<DET(,EZ)?|NOUN(,EZ)?|NUM(,EZ)?|ADJ(,EZ)|PRON><DET(,EZ)|NOUN(,EZ)|NUM(,EZ)|ADJ(,EZ)|PRON>*}
          <NOUN>}{<.*(,EZ)?>
  """

  valid_pos_tags = {'NOUN', 'NOUN,EZ', 'ADJ', 'ADJ,EZ'}
  extractor = perke.unsupervised.graph_based.PositionRank(valid_pos_tags=valid_pos_tags)

  # NOTE(review): universal_pos_tags=False is presumably what makes the
  # ",EZ"-suffixed tags used by the grammar and valid_pos_tags available —
  # confirm against the perke documentation.
  extractor.load_text(
      input=text, word_normalization_method='stemming',
      universal_pos_tags=False,
  )
  extractor.select_candidates(grammar=grammar, maximum_word_number=3)
  extractor.weight_candidates(window_size=10)

  keyphrases = extractor.get_n_best(n=n)
  # Each entry is a (candidate, score) pair. Unpacking pairwise, rather than
  # `zip(*keyphrases)`, avoids a ValueError when nothing was extracted.
  return [candidate for candidate, _score in keyphrases]


def ke_multipartiteRank(text, n=10):
  """Extract up to `n` keyphrases from `text` with perke's MultipartiteRank.

  Args:
    text: Input document handed to the extractor's `load_text`.
    n: Maximum number of keyphrases requested from `get_n_best`
      (defaults to 10, like the other `ke_*` helpers).

  Returns:
    List of keyphrases in the order `get_n_best` returns them; an empty
    list when the extractor yields no keyphrases.
  """
  # Only these part-of-speech tags are declared valid for the extractor.
  valid_pos_tags = {'NOUN', 'ADJ'}
  extractor = perke.unsupervised.graph_based.MultipartiteRank(valid_pos_tags=valid_pos_tags)

  extractor.load_text(input=text, word_normalization_method='stemming')
  extractor.select_candidates()
  # All weighting settings are forwarded to perke unchanged; see its
  # MultipartiteRank documentation for their exact meaning.
  extractor.weight_candidates(
      threshold=0.74,
      metric='jaccard',
      linkage_method='average',
      alpha=1.1,
  )

  keyphrases = extractor.get_n_best(n=n)
  # Each entry is a (candidate, score) pair. Unpacking pairwise, rather than
  # `zip(*keyphrases)`, avoids a ValueError when nothing was extracted.
  return [candidate for candidate, _score in keyphrases]

In [None]:
# Per-document keyphrase lists, one list per extraction method. Entry i of
# each list belongs to all_texts[i].
topicRank_keywords = []
textRank_keywords = []
singleRank_keywords = []
positionRank_keywords = []
multi_keywords = []

# Number of keyphrases requested from every method.
n = 10
# Only the first 20 documents are processed.
for text in all_texts[:20]:
  topicRank_keywords.append(ke_topicRank(text, n))
  textRank_keywords.append(ke_textRank(text, n))
  # Previously this called ke_textRank, so the "SingleRank" results were
  # just a second copy of the TextRank output.
  singleRank_keywords.append(ke_singleRank(text, n))
  positionRank_keywords.append(ke_positionRank(text, n))
  multi_keywords.append(ke_multipartiteRank(text, n))



In [None]:
def calculate_recall(candidates, references):
    """Return the fraction of distinct reference keywords found in `candidates`.

    Args:
        candidates: Iterable of extracted keywords (hashable items).
        references: Iterable of gold keywords (hashable items).

    Returns:
        |references ∩ candidates| / |references| over distinct items, or 0.0
        when `references` is empty (instead of raising ZeroDivisionError).
    """
    reference_set = set(references)
    if not reference_set:
        return 0.0
    return len(reference_set & set(candidates)) / len(reference_set)

def calculate_percission(candidates, references):
  """Return the fraction of extracted keywords that are reference keywords.

  Args:
    candidates: Sized collection of extracted keywords (hashable items).
    references: Iterable of gold keywords (hashable items).

  Returns:
    |references ∩ candidates| / len(candidates), or 0.0 when `candidates`
    is empty (instead of raising ZeroDivisionError).
  """
  if not candidates:
    return 0.0
  # The closing parenthesis of len() used to wrap the division as well, which
  # divided a set by an int and raised TypeError on every call.
  return len(set(references) & set(candidates)) / len(candidates)

# Per-document recall and precision scores, one pair of lists per method.
topicRank_recall, topic_percision = [], []
textRank_recall, text_percision = [], []
singleRank_recall, single_percision = [], []
positionRank_recall, position_percision = [], []
multi_recall, multi_percision = [], []

# (extracted keywords, recall scores, precision scores) for every method, so
# all methods are scored by the same two calls.
evaluated_methods = [
    (topicRank_keywords, topicRank_recall, topic_percision),
    (textRank_keywords, textRank_recall, text_percision),
    (singleRank_keywords, singleRank_recall, single_percision),
    (positionRank_keywords, positionRank_recall, position_percision),
    (multi_keywords, multi_recall, multi_percision),
]

# Only the documents that were actually processed are scored; entry i of each
# keyword list is compared against the keywords in row i of df.
processed_references = df['keywords'].iloc[:len(topicRank_keywords)]
for i, references in enumerate(processed_references):
  for keywords, recalls, precisions in evaluated_methods:
    recalls.append(calculate_recall(keywords[i], references))
    # Previously calculate_recall was called here too, so every "precision"
    # list was a copy of the recall list.
    precisions.append(calculate_percission(keywords[i], references))


print("Topic Rank recall score is:", topicRank_recall)
print("Topic Rank precision score is:", topic_percision)

print("Text Rank recall score is:", textRank_recall)
print("Text Rank precision score is:", text_percision)

print("Single Rank recall score is:", singleRank_recall)
print("Single Rank precision score is:", single_percision)

print("Position Rank recall score is:", positionRank_recall)
print("Position Rank precision score is:", position_percision)

print("Multipartite Rank recall score is:", multi_recall)
print("Multipartite Rank precision score is:", multi_percision)

NameError: ignored