<a href="https://colab.research.google.com/github/Nima-Nilchian/Keyword_extraction/blob/master/Keyword_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Keyword Extraction

# استخراج عبارت‌های کلیدی متن

<p dir="rtl" align="right">
ارزیابی روش‌ها مورد استفاده قرار خواهد گرفت. علاوه بر گزارش نتایج برای هر یک از روش‌ها با استفاده از مجموعه داده، نمونه‌هایی از خروجی را هم برای هر روش گزارش کرده و به صورت شهودی نیز مقایسه‌ای انجام دهید. همچنین سرعت روش‌های پیاده‌سازی شده نیز نیاز به مقایسه دارد.
</p>
<p dir="rtl" align="right">
روش‌های زیر را پیاده‌سازی و طبق روال بالا ارزیابی و مقایسه کنید:
</p>

*   baseline (TF-IDF)
*   TF-IDF with Ngrams
*   TF-IDF with chunking
*   KP-Miner
*   Yake
*   TextRank
*   SingleRank
*   TopicRank
*   TopicalPageRank
*   PositionRank
*   MultipartiteRank
*   scake
*   sgrank











# Reading and preprocessing Data

In [None]:
!pip install git+https://github.com/boudinfl/pke.git
!pip install perke
!python -m perke download

In [21]:
import numpy as np
import pandas as pd
import json
import string
import perke
import time
import multiprocessing

In [3]:
# Mount Google Drive under ./content (relative to the current working
# directory) so the dataset file can be opened through a regular file path.
from google.colab import drive
drive.mount("./content")

Mounted at ./content


In [4]:
data_loc = './content/MyDrive/datasets/ke/ke_dataset.txt'

# The dataset is read as JSON Lines: one JSON object per line.
# Parse every record first and build the frame once -- concatenating inside the
# loop re-copies the whole frame on each iteration (quadratic in the row count).
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale; blank lines are skipped instead of crashing
# json.loads.
with open(data_loc, 'r', encoding='utf-8') as f:
  records = [json.loads(line) for line in f if line.strip()]

# `id` is not used by the rest of the notebook; drop it without mutating in
# place so re-running this cell always yields the same frame.
df = pd.DataFrame(records).drop('id', axis=1)
all_texts = df['body'].tolist()

In [5]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450 entries, 0 to 449
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   body      450 non-null    object
 1   keywords  450 non-null    object
dtypes: object(2)
memory usage: 7.2+ KB


In [23]:
def ke_topicRank(text, n=10):
  """Extract keyphrases from ``text`` with perke's TopicRank extractor.

  Args:
    text: The document, passed to the extractor's ``load_text`` as ``input``.
    n: Maximum number of keyphrases requested from ``get_n_best``
      (defaults to 10, the value previously hard-coded).

  Returns:
    A list with the candidate of each (candidate, score) pair returned by
    ``get_n_best``, in the order the extractor returned them. The list is
    empty when the extractor returns no keyphrases.
  """
  valid_pos_tags = {'NOUN', 'ADJ'}
  extractor = perke.unsupervised.graph_based.TopicRank(valid_pos_tags=valid_pos_tags)

  extractor.load_text(input=text, word_normalization_method='lemmatization')
  extractor.select_candidates()
  extractor.weight_candidates(
      threshold=0.74, metric='jaccard', linkage_method='average')

  keyphrases = extractor.get_n_best(n=n)

  # Unpack pair by pair: `zip(*keyphrases)` raises ValueError when the
  # extractor returns nothing, whereas this simply yields an empty list.
  return [candidate for candidate, _score in keyphrases]


def ke_textRank(text, n=10):
  """Extract keyphrases from ``text`` with perke's TextRank extractor.

  Unlike the sibling extractors in this cell, no ``select_candidates`` call is
  made here; ``weight_candidates`` is given ``top_t_percent`` instead.

  Args:
    text: The document, passed to the extractor's ``load_text``.
    n: Maximum number of keyphrases requested from ``get_n_best``
      (defaults to 10, the value previously hard-coded).

  Returns:
    A list with the candidate of each (candidate, score) pair returned by
    ``get_n_best``, in the order the extractor returned them. The list is
    empty when the extractor returns no keyphrases.
  """
  valid_pos_tags = {'NOUN', 'ADJ'}
  extractor = perke.unsupervised.graph_based.TextRank(valid_pos_tags=valid_pos_tags)

  extractor.load_text(text, word_normalization_method='lemmatization')
  extractor.weight_candidates(window_size=2, top_t_percent=0.33)

  keyphrases = extractor.get_n_best(n=n)

  # Unpack pair by pair: `zip(*keyphrases)` raises ValueError when the
  # extractor returns nothing, whereas this simply yields an empty list.
  return [candidate for candidate, _score in keyphrases]


def ke_singleRank(text, n=10):
  """Extract keyphrases from ``text`` with perke's SingleRank extractor.

  Args:
    text: The document, passed to the extractor's ``load_text`` as ``input``.
    n: Maximum number of keyphrases requested from ``get_n_best``
      (defaults to 10, the value previously hard-coded).

  Returns:
    A list with the candidate of each (candidate, score) pair returned by
    ``get_n_best``, in the order the extractor returned them. The list is
    empty when the extractor returns no keyphrases.
  """
  valid_pos_tags = {'NOUN', 'ADJ'}
  extractor = perke.unsupervised.graph_based.SingleRank(valid_pos_tags=valid_pos_tags)

  extractor.load_text(input=text, word_normalization_method='lemmatization')
  extractor.select_candidates()
  # NOTE(review): this call passes `window=` while ke_positionRank passes
  # `window_size=` -- confirm the keyword against the installed perke version.
  extractor.weight_candidates(window=10)

  keyphrases = extractor.get_n_best(n=n)

  # Unpack pair by pair: `zip(*keyphrases)` raises ValueError when the
  # extractor returns nothing, whereas this simply yields an empty list.
  return [candidate for candidate, _score in keyphrases]


def ke_positionRank(text, n=10):
  """Extract keyphrases from ``text`` with perke's PositionRank extractor.

  Candidates are selected with a chunking grammar over non-universal POS tags
  (``universal_pos_tags=False``) and limited to three words.

  Args:
    text: The document, passed to the extractor's ``load_text`` as ``input``.
    n: Maximum number of keyphrases requested from ``get_n_best``. Defaults to
      10, matching the other extractors in this cell; the previous body read
      an undefined global ``n`` and failed with NameError on a fresh kernel.

  Returns:
    A list with the candidate of each (candidate, score) pair returned by
    ``get_n_best``, in the order the extractor returned them. The list is
    empty when the extractor returns no keyphrases.
  """
  # Define the grammar for selecting the keyphrase candidates
  grammar = r"""
      NP:
          {<NOUN>}<VERB>
      NP:
          {<DET(,EZ)?|NOUN(,EZ)?|NUM(,EZ)?|ADJ(,EZ)|PRON><DET(,EZ)|NOUN(,EZ)|NUM(,EZ)|ADJ(,EZ)|PRON>*}
          <NOUN>}{<.*(,EZ)?>
  """
  valid_pos_tags = {'NOUN', 'NOUN,EZ', 'ADJ', 'ADJ,EZ'}
  extractor = perke.unsupervised.graph_based.PositionRank(valid_pos_tags=valid_pos_tags)

  extractor.load_text(
      input=text, word_normalization_method='stemming',
      universal_pos_tags=False,
  )
  extractor.select_candidates(grammar=grammar, maximum_word_number=3)
  extractor.weight_candidates(window_size=10)

  keyphrases = extractor.get_n_best(n=n)

  # Unpack pair by pair: `zip(*keyphrases)` raises ValueError when the
  # extractor returns nothing, whereas this simply yields an empty list.
  return [candidate for candidate, _score in keyphrases]


def ke_multipartiteRank(text, n=10):
  """Extract keyphrases from ``text`` with perke's MultipartiteRank extractor.

  Args:
    text: The document, passed to the extractor's ``load_text`` as ``input``.
    n: Maximum number of keyphrases requested from ``get_n_best``. Defaults to
      10, matching the other extractors in this cell; the previous body read
      an undefined global ``n`` and failed with NameError on a fresh kernel.

  Returns:
    A list with the candidate of each (candidate, score) pair returned by
    ``get_n_best``, in the order the extractor returned them. The list is
    empty when the extractor returns no keyphrases.
  """
  valid_pos_tags = {'NOUN', 'ADJ'}
  extractor = perke.unsupervised.graph_based.MultipartiteRank(valid_pos_tags=valid_pos_tags)

  extractor.load_text(input=text, word_normalization_method='stemming')
  extractor.select_candidates()
  extractor.weight_candidates(
      threshold=0.74,
      metric='jaccard',
      linkage_method='average',
      alpha=1.1,
  )

  keyphrases = extractor.get_n_best(n=n)

  # Unpack pair by pair: `zip(*keyphrases)` raises ValueError when the
  # extractor returns nothing, whereas this simply yields an empty list.
  return [candidate for candidate, _score in keyphrases]

In [None]:
# Number of processes to run in parallel
num_processes = multiprocessing.cpu_count()

# Only the first 25 documents are processed. Slice once so every method is
# guaranteed to run on exactly the same documents.
sample_texts = all_texts[:25]

# Create a multiprocessing pool
pool = multiprocessing.Pool(processes=num_processes)
try:
  # Perform keyword extraction for each text using multiprocessing
  results_topic = pool.map(ke_topicRank, sample_texts)
  results_text = pool.map(ke_textRank, sample_texts)
  results_single = pool.map(ke_singleRank, sample_texts)
  results_position = pool.map(ke_positionRank, sample_texts)
  results_multi = pool.map(ke_multipartiteRank, sample_texts)
finally:
  # Close the pool of processes even when one of the extractors raised;
  # otherwise the worker processes would be left running.
  pool.close()
  pool.join()

In [53]:
def evaluate(candidates, references):
    """Compute the exact-match F1 score between two keyphrase collections.

    Both inputs are turned into sets, so duplicates count once and order is
    ignored.

    Args:
        candidates: Iterable of hashable predicted keyphrases.
        references: Iterable of hashable reference keyphrases.

    Returns:
        The harmonic mean of precision and recall as a float. ``0.0`` is
        returned when there is no overlap, which includes the case where
        either collection is empty.
    """
    candidate_set = set(candidates)
    reference_set = set(references)
    matched = len(candidate_set & reference_set)

    # With no match both precision and recall are zero. Returning early also
    # avoids dividing by zero when either set is empty.
    if matched == 0:
        return 0.0

    p = matched / len(candidate_set)
    r = matched / len(reference_set)
    return (2*p*r) / (p+r)

In [59]:
# Per-document F1 scores, one list per extraction method.
topicRank, textRank, singleRank, positionRank, multi = [], [], [], [], []

# Pair each score list with the predictions it is computed from.
scored_methods = (
    (topicRank, results_topic),
    (textRank, results_text),
    (singleRank, results_single),
    (positionRank, results_position),
    (multi, results_multi),
)

# Score document i of every method against the keywords stored in row i of df.
for i in range(len(results_topic)):
  references = df['keywords'].iloc[i]
  for scores, predictions in scored_methods:
    scores.append(evaluate(predictions[i], references))

for label, scores in (
    ("Topic_Rank f1 score is:", topicRank),
    ("Text_Rank f1 score is:", textRank),
    ("Single_Rank f1 score is:", singleRank),
    ("Position_Rank f1 score is:", positionRank),
    ("Multipartite_Rank f1 score is:", multi),
):
  print(label, scores)

Topic_Rank f1 score is: [0.125, 0, 0, 0.11111111111111112, 0.125, 0, 0.14285714285714288, 0.26666666666666666, 0.11111111111111112, 0.10000000000000002, 0.22222222222222224, 0, 0, 0.11764705882352941, 0.13333333333333333, 0.25, 0.125, 0.13333333333333333, 0.125, 0, 0, 0.2105263157894737, 0, 0.1904761904761905, 0]
Text_Rank f1 score is: [0, 0, 0.15384615384615383, 0.11111111111111112, 0.125, 0.14285714285714288, 0.14285714285714288, 0, 0.11111111111111112, 0.10000000000000002, 0.11111111111111112, 0, 0, 0, 0, 0, 0.125, 0.13333333333333333, 0.25, 0.13333333333333333, 0, 0, 0, 0, 0.26666666666666666]
Single_Rank f1 score is: [0, 0, 0.15384615384615383, 0.11111111111111112, 0.125, 0.28571428571428575, 0, 0, 0, 0.10000000000000002, 0.11111111111111112, 0, 0, 0, 0.26666666666666666, 0, 0.125, 0, 0.125, 0.13333333333333333, 0, 0, 0, 0.09523809523809525, 0.13333333333333333]
Position_Rank f1 score is: [0, 0, 0.15384615384615383, 0, 0, 0.14285714285714288, 0, 0.13333333333333333, 0.222222222222

In [65]:
# Average the per-document F1 scores of each method.
topic_mean_f1 = np.array(topicRank).mean()
text_mean_f1 = np.array(textRank).mean()
single_mean_f1 = np.array(singleRank).mean()
position_mean_f1 = np.array(positionRank).mean()
multi_mean_f1 = np.array(multi).mean()

# Report every mean rounded to four decimals.
for label, mean_f1 in (
    ("Topic Rank mean f1 score is:", topic_mean_f1),
    ("Text Rank mean f1 score is:", text_mean_f1),
    ("Single Rank mean f1 score is:", single_mean_f1),
    ("Position Rank mean f1 score is:", position_mean_f1),
    ("Multipartite Rank mean f1 score is:", multi_mean_f1),
):
  print(label, mean_f1.round(4))

Topic Rank mean f1 score is: 0.0996
Text Rank mean f1 score is: 0.0762
Single Rank mean f1 score is: 0.0706
Position Rank mean f1 score is: 0.0461
Multipartite Rank mean f1 score is: 0.1425
