In [4]:
import pandas as pd
import numpy as np
import nltk
nltk.download("wordnet")
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
import re
import math
from collections import Counter

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [5]:
dataset = pd.read_json("News_Category_Dataset_v2.json", lines=True)
df = dataset[["category","headline","short_description"]]
df.head(5)

print(np.unique(np.array(df["category"]))) # 42 categories

['ARTS' 'ARTS & CULTURE' 'BLACK VOICES' 'BUSINESS' 'COLLEGE' 'COMEDY'
 'CRIME' 'CULTURE & ARTS' 'DIVORCE' 'EDUCATION' 'ENTERTAINMENT'
 'ENVIRONMENT' 'FIFTY' 'FOOD & DRINK' 'GOOD NEWS' 'GREEN' 'HEALTHY LIVING'
 'HOME & LIVING' 'IMPACT' 'LATINO VOICES' 'MEDIA' 'MONEY' 'PARENTING'
 'PARENTS' 'POLITICS' 'QUEER VOICES' 'RELIGION' 'SCIENCE' 'SPORTS' 'STYLE'
 'STYLE & BEAUTY' 'TASTE' 'TECH' 'THE WORLDPOST' 'TRAVEL' 'WEDDINGS'
 'WEIRD NEWS' 'WELLNESS' 'WOMEN' 'WORLD NEWS' 'WORLDPOST']


In [6]:
df = dataset[["category","headline","short_description"]]

# find not alphabet in "headline" column
foundNotAlphaHeadline = []
for row in df["headline"]:
  foundNotAlphaHeadline = foundNotAlphaHeadline + re.findall("[^a-zA-Z0-9 ]",row)

foundNotAlphaHeadline = np.unique(np.array(foundNotAlphaHeadline))
print("Not Alphabet in 'Headline' column:\n",foundNotAlphaHeadline)

Not Alphabet in 'Headline' column:
 ['\t' '!' '"' '#' '$' '%' '&' "'" '(' ')' '*' '+' ',' '-' '.' '/' ':' ';'
 '=' '>' '?' '@' '[' '\\' ']' '^' '_' '{' '|' '}' '~' '\x7f' '\xa0' '¡'
 '¢' '£' '©' '«' '\xad' '®' '¯' '°' '´' '·' '½' '¿' 'À' 'Ç' 'É' 'Ñ' 'Ü'
 'à' 'á' 'â' 'ã' 'ä' 'å' 'ç' 'è' 'é' 'ê' 'ë' 'ì' 'í' 'î' 'ï' 'ñ' 'ó' 'ô'
 'ö' 'ø' 'ù' 'ú' 'û' 'ü' 'ā' 'ć' 'Č' 'ğ' 'œ' 'ś' 'ū' 'ů' 'ž' 'ǒ' 'ș' 'ʻ'
 'ʼ' 'Έ' 'Κ' 'ά' 'ί' 'α' 'γ' 'δ' 'ε' 'η' 'ι' 'κ' 'λ' 'μ' 'ν' 'ξ' 'ο' 'ρ'
 'ς' 'σ' 'τ' 'υ' 'ω' 'ό' 'ύ' 'ώ' 'ᴥ' 'ᵒ' 'ᶅ' 'ᶘ' 'ạ' 'ấ' 'ễ' '\u2009'
 '\u200a' '\u200b' '\u200e' '\u200f' '‐' '–' '—' '―' '‘' '’' '“' '”' '…'
 '\u2028' '′' '™' '⊙' '◕' 'ツ' '︿' '\ufeff' 'ﾩ' 'ﾫ' 'ￃ' '🌮' '🌷' '🌿' '🍂' '🍃'
 '💯']


In [7]:
# find not alphabet in "short_description" column
foundNotAlphaDesc = []
for row in df["short_description"]:
  foundNotAlphaDesc = foundNotAlphaDesc + re.findall("[^a-zA-Z0-9 ]",row)

foundNotAlphaDesc = np.unique(np.array(foundNotAlphaDesc))
print("Not Alphabet in 'short_description' column:\n",foundNotAlphaDesc)

Not Alphabet in 'short_description' column:
 ['\t' '\n' '!' '"' '#' '$' '%' '&' "'" '(' ')' '*' '+' ',' '-' '.' '/' ':'
 ';' '=' '>' '?' '@' '[' '\\' ']' '^' '_' '`' '{' '|' '}' '~' '\x80'
 '\x93' '\x94' '\x99' '\xa0' '¡' '£' '©' '¬' '\xad' '®' '¯' '°' '±' '´'
 '·' '¹' 'º' '¼' '½' '¾' '¿' 'Á' 'Å' 'Ç' 'É' 'Î' 'Ö' '×' 'Ø' 'Ü' 'à' 'á'
 'â' 'ã' 'ä' 'å' 'æ' 'ç' 'è' 'é' 'ê' 'ë' 'ì' 'í' 'î' 'ï' 'ñ' 'ò' 'ó' 'ô'
 'ö' 'ø' 'ù' 'ú' 'û' 'ü' 'ý' 'ā' 'ć' 'Č' 'č' 'ē' 'ė' 'ğ' 'ī' 'Ł' 'ł' 'ń'
 'ō' 'œ' 'Ş' 'ş' 'š' 'ū' 'ű' 'ž' 'ș' 'ə' 'ɛ' 'ɪ' 'ʼ' 'ˈ' 'ˌ' 'ː' '˚' '̀'
 '́' 'η' 'Г' 'Ж' 'П' 'а' 'б' 'в' 'г' 'е' 'з' 'и' 'к' 'л' 'м' 'н' 'о' 'п'
 'р' 'с' 'т' 'у' 'ф' 'х' 'ы' 'ь' 'я' 'ِ' 'আ' 'ই' 'গ' 'ট' 'ভ' 'র' 'ল' 'স'
 'া' 'ি' 'ো' '্' 'ಠ' 'ầ' 'ễ' '\u200a' '\u200b' '\u200d' '\u200e' '\u200f'
 '‐' '‒' '–' '—' '―' '‘' '’' '‚' '“' '”' '•' '…' '\u2028' '\u202a'
 '\u202c' '\u202f' '′' '″' '‹' '›' '€' '™' '≠' '②' '─' '☀' '☂' '☃' '☔' '☕'
 '☘' '☼' '♂' '♥' '♦' '♫' '♬' '⚡' '⚾' '⛄' '⛱' '⛽' '✂' '✈' '✊' '✌' '✔' '✨'
 '❄' '❤' '➡'

In [38]:
def removeNotAlpha(sentence, column):
  if (column == "headline"):
    for na in foundNotAlphaHeadline:
      sentence = sentence.replace(na,"")

  elif (column == "short_description"):
    for na in foundNotAlphaDesc:
      sentence = sentence.replace(na,"")
  
  elif (column == "query"):
    for na in foundNotAlphaDesc:
      sentence = sentence.replace(na,"")

  return sentence

def toLower(sentence):
  return sentence.lower()

apostrophes = ["’s","’ll","’re","’m","n’t","’d","'s","'ll","'re","'m","n't","'d"]
def removeApostrophes(sentence):
  for ap in apostrophes:
    sentence = sentence.replace(ap,"")
  return sentence

def getWordNetPOS(term):
  tag = nltk.pos_tag([term])[0][1][0].upper()
  tag_dict = {
      "J": wordnet.ADJ,
      "N": wordnet.NOUN,
      "V": wordnet.VERB,
      "R": wordnet.ADV}

  return tag_dict.get(tag, wordnet.NOUN)

pattern = re.compile("[[a-zA-z0-9]*]?[0-9][[a-zA-z0-9]*]?")
lemmatizer = WordNetLemmatizer()
def lemmatization(sentence):
  words = sentence.split()
  lemmatized_word = list(map(lambda word: lemmatizer.lemmatize(word, getWordNetPOS(word)) if (pattern.match(word) == None) else word, words))
  return " ".join(lemmatized_word)

In [10]:
for idx in range(df.shape[0]):
  headl = df["headline"][idx]
  desc = df["short_description"][idx]

  headl = removeNotAlpha(headl, "headline")
  desc = removeNotAlpha(desc, "short_description")

  headl = toLower(headl)
  desc = toLower(desc)
  
  headl = removeApostrophes(headl)
  desc = removeApostrophes(desc)

  headl = lemmatization(headl)
  desc = lemmatization(desc)

  df["category"][idx] = toLower(df["category"][idx])
  df["headline"][idx] = headl
  df["short_description"][idx] = desc

df.head(8)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,category,headline,short_description
0,crime,there be 2 mass shooting in texas last week bu...,she left her husband he kill their child just ...
1,entertainment,will smith join diplo and nicky jam for the 20...,of course it have a song
2,entertainment,hugh grant marries for the first time at age 57,the actor and his longtime girlfriend anna ebe...
3,entertainment,jim carrey blast castrato adam schiff and demo...,the actor give dems an asskicking for not figh...
4,entertainment,julianna margulies us donald trump poop bag to...,the dietland actress say use the bag be a real...
5,entertainment,morgan freeman devastate that sexual harassmen...,it be not right to equate horrific incident of...
6,entertainment,donald trump be lovin new mcdonalds jingle in ...,it catchy all right
7,entertainment,what to watch on amazon prime thats new this week,there a great miniseries join this week


In [22]:
data = pd.DataFrame({
    "headline": df["headline"],
     "body": df["category"] +" "+df["headline"]+" "+df["short_description"]})

data.head(5)

Unnamed: 0,headline,body
0,there be 2 mass shooting in texas last week bu...,crime there be 2 mass shooting in texas last w...
1,will smith join diplo and nicky jam for the 20...,entertainment will smith join diplo and nicky ...
2,hugh grant marries for the first time at age 57,entertainment hugh grant marries for the first...
3,jim carrey blast castrato adam schiff and demo...,entertainment jim carrey blast castrato adam s...
4,julianna margulies us donald trump poop bag to...,entertainment julianna margulies us donald tru...


In [41]:
# Input Query
query = "crime in texas"

q_data = removeNotAlpha(query,"query")
q_data = toLower(q_data)
q_data = removeApostrophes(q_data)
q_data = lemmatization(q_data)

In [42]:
def countTF(term):
  tf = {}
  idx = 0
  for doc in data.iloc():
    body = doc["body"].split()
    tf[doc["headline"]] = body.count(term) + 1 # +1 to avoid zero division

  return tf

def countDF(term):
  count = 0
  for doc in data["body"]:
    if((term in doc) == True):
      count = count + 1
  return count

def countDocLength():
  lengths = {}
  for doc in data.iloc():
    lengths[doc["headline"]] = len(doc["body"].split())
  
  return lengths

# calculate BM23 score for each term
def calculateEachScore(term):
  docNum = data.shape[0]
  docLength = countDocLength()
  docAvgLength = sum(list(docLength.values()))/len(list(docLength.values()))
  tf = countTF(term)
  df = countDF(term)
  idf = math.log10((docNum - df + 0.5)/(df + 0.5))
  k = 2
  b = 0.75

  score = {}
  for doc in data.iloc():
    score[doc["headline"]] = ((tf[doc["headline"]] * (k + 1)) / (tf[doc["headline"]] + k * (1 - b + b * docLength[doc["headline"]] / docAvgLength))) * idf

  return score

# calculate BM23 score for each body
def calculateScore(query):
  terms = query.split()
  score = {}
  for term in terms:
    score = dict(Counter(score) + Counter(calculateEachScore(term)))
  
  return score

In [43]:
# sort output by score descending
output = sorted(calculateScore(q_data).items(), key=lambda item: item[1], reverse=True)

In [44]:
# Results
output = pd.DataFrame(output, columns = ["News", "BM25 Score"])
output.head(6)

Unnamed: 0,News,BM25 Score
0,texas serial shooter wont stop,7.353313
1,texas prisoner still face deadly heat report,7.125399
2,hundred of inmate relocate after texas prison ...,7.016326
3,texas massacre suspect have previous domestic ...,7.016326
4,fatal alligator attack in texas,6.99781
5,texas executes first inmate with new batch of ...,6.910348
