<a href="https://colab.research.google.com/github/Sayed-Ali-Raza-Naqvi/CodexCue_Keyword-Extraction_Project/blob/main/CodexCue_Keyword_Extraction_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Keyword Extraction

In [None]:
import csv
import pickle
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# Load the papers dataset with the tolerant python parser.
# on_bad_lines='skip' silently drops rows the parser cannot split, and a
# backslash is treated as an escape character.
# NOTE(review): the recorded df.sample() output contains a row whose `id`
# column holds free text, which suggests some fields are being misaligned by
# these parser settings -- confirm against the raw file.
try:
    df = pd.read_csv('/content/papers.csv', engine='python', on_bad_lines='skip', quoting=csv.QUOTE_MINIMAL, escapechar='\\')
except pd.errors.ParserError as e:
    print(f"Error parsing CSV file: {e}")
    # Re-raise: swallowing the error would leave `df` undefined and make the
    # next cell fail with a NameError that hides the real cause.
    raise

In [None]:
# Spot-check five randomly drawn rows; no random_state is passed, so the rows
# shown differ on every run.
df.sample(5)

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
23846,pruning weights or eliminating biases. Vhcn a ...,vd }[~:1,,,,,
12751,4253,2011,Signal Estimation Under Random Time-Warpings a...,,4253-signal-estimation-under-random-time-warpi...,While signal estimation under random amplitude...,Signal Estimation Under Random Time-Warpings\n...
9088,2315,2002,Bayesian Image Super-Resolution,,2315-bayesian-image-super-resolution.pdf,Abstract Missing,Bayesian Image Super-Resolution\n\nMichael E. ...
14542,4964,2013,Projecting Ising Model Parameters for Fast Mixing,Poster,4964-projecting-ising-model-parameters-for-fas...,Inference in general Ising models is difficult...,Projecting Ising Model Parameters for Fast Mix...
9941,2786,2005,Oblivious Equilibrium: A Mean Field Approximat...,,2786-oblivious-equilibrium-a-mean-field-approx...,Abstract Missing,Oblivious Equilibrium: A Mean Field\nApproxima...


In [None]:
# Keep only the rows that have a paper_text value; `df` itself is not modified.
# The index is not reset here, so df_cleaned keeps the row labels of `df`.
df_cleaned = df.dropna(subset=['paper_text'])

In [None]:
# (rows, columns) of the frame as loaded, before any rows are dropped.
df.shape

(25667, 7)

In [None]:
# Missing values per column that remain after filtering on paper_text.
df_cleaned.isnull().sum()

id               0
year             0
title            0
event_type    4778
pdf_name         0
abstract         0
paper_text       0
dtype: int64

In [None]:
# (rows, columns) after the paper_text filter; compare with df.shape to see
# how many rows were removed.
df_cleaned.shape

(7217, 7)

## Preprocessing Data

In [None]:
# Corpora/models needed below: the stop-word list and the Punkt tokenizer data
# used by nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# fetching both keeps word_tokenize working across NLTK versions.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Start from NLTK's English stop-word list, held as a set.
stop_words = set(stopwords.words('english'))

In [None]:
# Extra words that are filtered out in addition to the NLTK stop-word list.
user_defined_stopwords = ['fig', 'figure', 'result', 'using', 'show', 'large',
                          'one', 'two', 'three', 'four', 'five', 'six', 'seven',
                          'eight', 'nine', 'also']
# Go through set() so this cell can be re-run: after the first run `stop_words`
# is a list, and a list has no `.union`. The result stays a list, as before.
stop_words = list(set(stop_words).union(user_defined_stopwords))

In [None]:
# Single Porter stemmer instance shared by text_preprocessing.
stemmer = PorterStemmer()

In [None]:
def text_preprocessing(text):
  """Normalise one document into a space-separated string of stemmed tokens.

  Steps, in order: lowercase, replace `<...>` tag-like spans with a space,
  replace every non-letter character with a space, tokenize with NLTK, drop
  tokens that are in the module-level `stop_words` or have 3 or fewer
  characters, and stem the survivors with the module-level `stemmer`.

  Args:
    text: the raw document as a string.

  Returns:
    The remaining stems joined by single spaces.
  """
  lowered = text.lower()
  without_tags = re.sub(r'<.*?>', ' ', lowered)
  letters_only = re.sub(r'[^a-zA-Z]', ' ', without_tags)

  stems = []
  for token in nltk.word_tokenize(letters_only):
    # Skip short tokens and stop words before paying for stemming.
    if len(token) <= 3 or token in stop_words:
      continue
    stems.append(stemmer.stem(token))

  return ' '.join(stems)

In [None]:
# Preprocess every paper; `docs` keeps the index labels of df_cleaned.
docs = df_cleaned['paper_text'].apply(text_preprocessing)

## Count Vectorizer
CountVectorizer in NLP is like a word counter. It converts text documents into a matrix where rows represent documents, columns represent unique words, and the cell values represent the frequency of each word. For example, given the documents "I love NLP" and "NLP is amazing", CountVectorizer (with its default settings, which lowercase the text and drop single-character tokens such as "I") learns the vocabulary [amazing, is, love, nlp] and produces the row [0, 0, 1, 1] for "I love NLP" and the row [1, 1, 0, 1] for "NLP is amazing": one column per unique word, holding the number of times that word occurs in the document.


## TF-IDF Transformer
TF-IDF Transformer in NLP transforms a count matrix from CountVectorizer into a matrix of TF-IDF (Term Frequency-Inverse Document Frequency) features. A TF-IDF score weighs how often a word occurs in a document against how many documents in the corpus contain it, so words that are frequent in one document but rare elsewhere score highest, thereby providing more meaningful features for text analysis and machine learning tasks. For example, for the documents "I love NLP" and "NLP is amazing" with the count matrix [[0, 0, 1, 1], [1, 1, 0, 1]] (columns: amazing, is, love, nlp), the word "nlp" occurs in both documents and therefore receives a lower weight than "love", which occurs only in the first.

In [None]:
# Build the term-count matrix over the preprocessed papers.
count_vectorizer = CountVectorizer(
    ngram_range=(1, 3),   # unigrams, bigrams and trigrams
    max_features=5000,    # cap on the vocabulary size
    max_df=0.95,          # ignore terms present in more than 95% of the documents
)
word_count_vector = count_vectorizer.fit_transform(docs)

In [None]:
# Learn the IDF weights from the corpus counts; fit() returns the fitted
# transformer itself, so construction and fitting can be chained.
transformer = TfidfTransformer(use_idf=True, smooth_idf=True).fit(word_count_vector)

## Keyword Extraction

In [None]:
# Vocabulary terms of the fitted vectorizer; get_keywords looks terms up here
# by matrix column number.
feature_names = count_vectorizer.get_feature_names_out()

In [None]:
def get_keywords(idx, doc, num_keywords=10):
  """Return the highest-scoring TF-IDF terms of one preprocessed paper.

  The document is read from the module-level `docs` as `docs[idx]` and scored
  with the module-level `count_vectorizer`, `transformer` and `feature_names`.

  Args:
    idx: key used to look the document up in `docs`.
    doc: accepted for call compatibility; the body never reads it.
    num_keywords: maximum number of terms to return.

  Returns:
    A dict mapping term -> TF-IDF score rounded to 3 decimals, inserted in
    descending score order (ties broken by descending column number).
  """
  counts = count_vectorizer.transform([docs[idx]])
  tfidf_row = transformer.transform(counts).tocoo()

  # Pair each non-zero column with its score and rank best-first.
  ranked = sorted(
      zip(tfidf_row.col, tfidf_row.data),
      key=lambda pair: (pair[1], pair[0]),
      reverse=True,
  )

  return {
      feature_names[column]: round(weight, 3)
      for column, weight in ranked[:num_keywords]
  }

In [None]:
def print_keywords(idx, keywords, df):
  """Print a paper's title and abstract followed by its keywords.

  Args:
    idx: key used to look up the row in df['title'] and df['abstract'].
    keywords: mapping of keyword -> score; printed one pair per line.
    df: DataFrame providing the 'title' and 'abstract' columns.
  """
  headed_columns = (
      ('==========Title==========', 'title'),
      ('==========Abstract==========', 'abstract'),
  )
  for header, column in headed_columns:
    print(header)
    print(df[column][idx])

  print('==========Keywords==========')
  for keyword, score in keywords.items():
    print(keyword, score)

In [None]:
# Persist the fitted vectorizer, the fitted TF-IDF transformer and the
# vocabulary so they can be reloaded without refitting.
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes.
for pickle_path, artifact in (
    ('count_vectorizer.pkl', count_vectorizer),
    ('transformer.pkl', transformer),
    ('feature_names.pkl', feature_names),
):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(artifact, pickle_file)