<h2>IMPORTS & LIBRARIES</h2>

In [1]:
import json
import re
from pprint import pprint
import nltk
import spacy
# nltk.download('all')
# nltk.download('punkt')
# nltk.download('stopwords')
# from nltk.corpus import stopwords
from nltk.stem import *
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
import string
from rank_bm25 import BM25Okapi
from itertools import chain
import pandas as pd
import random
from transformers import T5Tokenizer, T5ForConditionalGeneration

<h2>LOAD DATA</h2>

<h3>Validation Data Input</h3>

In [2]:
import json

# Load the validation inputs. Later cells iterate over `data` and read the
# "input", "id" and "profile" keys of each item.
# The encoding is given explicitly so the read does not depend on the
# platform's default text encoding.
with open("./DATASET/validationI.json", encoding="utf-8") as f:
    data = json.load(f)

<h2>ARTICLES</h2>

In [3]:
# Build two parallel lists from the loaded records:
#   articles    - the text after the last "article: " marker of each "input",
#                 with surrounding whitespace stripped
#   articles_id - the "id" of the same record
articles = [record["input"].split("article: ")[-1].strip() for record in data]
articles_id = [record["id"] for record in data]

<h2>USER MODEL</h2>

In [4]:
# One inner list per record, holding the "text" of every entry in that
# record's "profile" list.
profiles_list = [
    [entry["text"] for entry in record["profile"]]
    for record in data
]

# Preview the first two profiles.
for preview in profiles_list[:2]:
    print(preview)

['The Electoral College will cast its vote for President-elect Trump in December, and he will be sworn into office in January.  He will then be president, but many Americans are having trouble accepting that fact.', 'Republican presidential nominee Donald Trump has been a disgrace to his party and an embarrassment to the nation.  He has consistently demonstrated that he does not have the temperament, judgment, background or humility to lead our country.', 'Clinton has been a polarizing figure throughout her career. In her Roosevelt Island speech, she portrayed herself as a fighter. But, ultimately, Clinton will have to be more publicly accountable for some of the legitimate questions that have been raised around her candidacy because they are not going away.', "The U.S. has the most powerful military in the history of the world, but it should not be utilized as a political tool or for retribution. The government and its leaders must do their best to make the right decisions, to be trut

<h2>TEXT PREPROCESSING</h2>

<h3>Articles Preprocessing</h3>

In [5]:
# Article preprocessing: lowercase, split on runs of non-word characters,
# drop stop words and empty tokens, then Porter-stem what is left.

articleslen = len(articles)
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
# Raw string: '\W' inside a plain string literal is an invalid escape sequence
# and triggers a warning on newer Python versions. Compiled once, reused below.
non_word_re = re.compile(r'\W+')

token_queries = []
for article in articles:
    pieces = non_word_re.split(article.lower())
    token_queries.append(
        [stemmer.stem(piece) for piece in pieces if piece and piece not in stop_words]
    )

# Preview the first three tokenized articles.
for tokens in token_queries[:3]:
    print(tokens)

['call', 'presidenti', 'impeach', 'cast', 'shadow', 'modern', 'day', 'presid', 'howev', 'choru', 'impeach', 'seem', 'louder', 'past', 'year']
['less', '10', 'week', 'go', 'midterm', 'congression', 'elect', 'american', 'gener', 'frustrat', 'washington', 'nation', 'poll', 'show', 'three', 'quarter', 'american', 'disapprov', 'way', 'congress', 'job', 'much', 'stake', 'come', 'elect', 'day', 'time', 'elig', 'voter', 'stay', 'home']
['u', 'power', 'militari', 'histori', 'world', 'util', 'polit', 'tool', 'retribut', 'govern', 'leader', 'must', 'best', 'make', 'right', 'decis', 'truth', 'american', 'peopl', 'provid', 'necessari', 'support', 'need', 'fulfil', 'militari', 'mission', 'unfortun', 'alway', 'case']


<h3>Profile Docs Preprocessing</h3>

In [6]:
# Profile preprocessing: the same pipeline as for articles (lowercase, split on
# non-word characters, drop stop words and empty tokens, Porter-stem), applied
# to every document of every profile. The nesting of `profiles_list` is kept:
# token_profiles[i][j] holds the tokens of profiles_list[i][j].

profilelen = len(profiles_list)
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
# Raw string: '\W' inside a plain string literal is an invalid escape sequence
# and triggers a warning on newer Python versions. Compiled once, reused below.
profile_non_word_re = re.compile(r'\W+')

token_profiles = []
for profile in profiles_list:
    profile_tokens = []
    for text in profile:
        pieces = profile_non_word_re.split(text.lower())
        profile_tokens.append(
            [stemmer.stem(piece) for piece in pieces if piece and piece not in stop_words]
        )
    token_profiles.append(profile_tokens)

# Preview the first two tokenized profiles.
for preview in token_profiles[:2]:
    print(preview)

[['elector', 'colleg', 'cast', 'vote', 'presid', 'elect', 'trump', 'decemb', 'sworn', 'offic', 'januari', 'presid', 'mani', 'american', 'troubl', 'accept', 'fact'], ['republican', 'presidenti', 'nomine', 'donald', 'trump', 'disgrac', 'parti', 'embarrass', 'nation', 'consist', 'demonstr', 'tempera', 'judgment', 'background', 'humil', 'lead', 'countri'], ['clinton', 'polar', 'figur', 'throughout', 'career', 'roosevelt', 'island', 'speech', 'portray', 'fighter', 'ultim', 'clinton', 'publicli', 'account', 'legitim', 'question', 'rais', 'around', 'candidaci', 'go', 'away'], ['u', 'power', 'militari', 'histori', 'world', 'util', 'polit', 'tool', 'retribut', 'govern', 'leader', 'must', 'best', 'make', 'right', 'decis', 'truth', 'american', 'peopl', 'provid', 'necessari', 'support', 'need', 'fulfil', 'militari', 'mission', 'unfortun', 'alway', 'case'], ['unconvent', 'announc', 'video', 'indic', 'time', 'differ', '18', 'month', 'go', '2016', 'elect', 'load', 'field', 'republican', 'candid', 'at

<h2>BUILD CORPUS</h2>

In [7]:
# Flatten the per-article token lists (one level deep) into a single token list.
articles_corpus = [token for query_tokens in token_queries for token in query_tokens]

# Flatten the per-profile, per-document token lists (two levels deep) likewise.
userprofile_corpus = [
    token
    for profile_docs in token_profiles
    for doc_tokens in profile_docs
    for token in doc_tokens
]

<h2>BM25 RETRIEVAL</h2>

<h3>Document Retrieval</h3>

In [8]:
# For each record, rank that record's own profile documents against its
# article tokens with BM25 and keep the best-scoring original text(s).
# NOTE(review): the category lookup below flattens `retrieval_list` and the
# prompt builder indexes the result per article, which lines up only while
# exactly one document is kept per article - revisit if TOP_K changes.
TOP_K = 1  # number of profile documents retrieved per article

retrieval_list = []
for i in range(profilelen):
    corpus = profiles_list[i]
    tokenized_corpus = token_profiles[i]
    tokenized_query = token_queries[i]
    bm25 = BM25Okapi(tokenized_corpus)
    # get_top_n scores the query itself, so the separate (unused) get_scores
    # call was dropped to avoid scoring every profile twice.
    retrieval_list.append(bm25.get_top_n(tokenized_query, corpus, n=TOP_K))

# Preview the retrieved document(s) for the first five articles.
for retrieved in retrieval_list[:5]:
    print(retrieved)

['The Electoral College will cast its vote for President-elect Trump in December, and he will be sworn into office in January.  He will then be president, but many Americans are having trouble accepting that fact.']
['The Electoral College will cast its vote for President-elect Trump in December, and he will be sworn into office in January.  He will then be president, but many Americans are having trouble accepting that fact.']
["President Putin's ultimate ambitions are not known, though it is clear he is using the seizure of Crimea and threats against Ukraine in part to strengthen his position at home.  Russia's economy is struggling, and government is riddled with corruption and cronyism."]
['He won the Pulitzer Prize, the National Book Award and the National Book Circle Critics Award all in the same year.']
['Pres. Donald Trump was briefed on the attack after his trip to visit victims of the Las Vegas shooting.']


<h3>Category Retrieval</h3>

In [9]:
def find_category(input_text, entries=None):
    """Return the category of the first profile entry whose text matches.

    The comparison is case-insensitive and requires the whole text to match.

    Args:
        input_text: Profile text to look up.
        entries: Iterable of record dicts, each optionally holding a
            'profile' list of dicts with 'text' and 'category' keys.
            Defaults to the notebook-level `data` loaded earlier.

    Returns:
        The 'category' of the first matching profile entry, scanning records
        and their profile entries in order. Returns 'Category not found' when
        nothing matches or the matching entry has no 'category' key.
    """
    if entries is None:
        entries = data
    # Lowercase the needle once instead of once per profile entry.
    wanted = input_text.lower()
    for entry in entries:
        for profile_entry in entry.get('profile', []):
            if profile_entry.get('text', '').lower() == wanted:
                return profile_entry.get('category', 'Category not found')
    return 'Category not found'

# Flatten the per-article retrieval results into one list of texts, then look
# up the category of each text in the same order.
flatRL = [document for documents in retrieval_list for document in documents]
category_list = [find_category(document) for document in flatRL]

print(category_list)

['politics', 'politics', 'politics', 'culture & arts', 'politics', 'travel', 'entertainment', 'travel', 'entertainment', 'style & beauty', 'entertainment', 'entertainment', 'style & beauty', 'sports', 'education', 'women', 'women', 'healthy living', 'women', 'women', 'style & beauty', 'women', 'travel', 'women', 'entertainment', 'entertainment', 'entertainment', 'entertainment', 'entertainment', 'entertainment', 'entertainment', 'entertainment', 'entertainment', 'entertainment', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'politics', 'entertainment', 'entertainment', 'science & technology', 'entertainment', 'women', 'entertainment', 'parents', 'entertainment', 'entertainment', 'entertainment', 'politics', 'entertainment', 'entertainment', 'entert

<h2>PROMPT ENGINEERING</h2>

In [10]:
def Aggregated_Input_Prompt(profilelen, articles, retrieval_list, category_list, category_options):
    """Build one classification prompt per article.

    Each prompt shows the retrieved profile document(s) with their category,
    then the current article, then the task instruction listing the allowed
    categories.

    Args:
        profilelen: Number of prompts to build; indexes 0..profilelen-1 are
            read from the three lists below.
        articles: Article texts.
        retrieval_list: Per-article retrieved documents. Each element is
            interpolated as-is, so a list renders in its Python list form.
        category_list: Per-article category of the retrieved document.
        category_options: Category names offered to the model. When empty or
            None, the notebook's standard 15 categories are used.

    Returns:
        A list of `profilelen` prompt strings.
    """
    # Previously this argument was ignored and the list was hard-coded in the
    # prompt; fall back to that same list so existing callers see no change.
    if not category_options:
        category_options = [
            "travel", "style & beauty", "food & drink", "sports", "business",
            "science & technology", "education", "politics", "religion", "crime",
            "parents", "women", "healthy living", "entertainment", "culture & arts",
        ]
    options_text = ", ".join(category_options)

    prompt_list = []
    for i in range(profilelen):
        query = articles[i]
        top_documents = retrieval_list[i]
        category = category_list[i]

        prompt = (
            f"The category for the article: '{top_documents}' is '{category}'\n"
            f"Current Article: {query}\n"
            "TASK: Given a current article above and the top relevant documents retrieved, "
            "determine the most relevant category from the options provided. "
            "Please respond with the category name only. "
            f"Categories: [{options_text}] ?"
        )
        prompt_list.append(prompt)

    return prompt_list

# Number of records to build prompts for, and the label set handed to the
# prompt builder.
profilelen = len(profiles_list)
category_options = [
    "travel",
    "style & beauty",
    "food & drink",
    "sports",
    "business",
    "science & technology",
    "education",
    "politics",
    "religion",
    "crime",
    "parents",
    "women",
    "healthy living",
    "entertainment",
    "culture & arts",
]

prompt_list = Aggregated_Input_Prompt(profilelen, articles, retrieval_list, category_list, category_options)

# Print every generated prompt, one after another.
if prompt_list:
    print("\n".join(prompt_list))


The category for the article: '['The Electoral College will cast its vote for President-elect Trump in December, and he will be sworn into office in January.  He will then be president, but many Americans are having trouble accepting that fact.']' is 'politics'
Current Article: Calls for presidential impeachment have cast a shadow over most modern-day presidents. However, the chorus of impeachers seems louder in the past year.
TASK: Given a current article above and the top relevant documents retrieved, determine the most relevant category from the options provided. Please respond with the category name only. Categories: [travel, style & beauty, food & drink, sports, business, science & technology, education, politics, religion, crime, parents, women, healthy living, entertainment, culture & arts] ?
The category for the article: '['The Electoral College will cast its vote for President-elect Trump in December, and he will be sworn into office in January.  He will then be president, but

In [11]:
# Assemble one table row per (article, retrieved document) pair, carrying the
# article id, article text, category and prompt alongside each document.
columns = ["ID", "Article", "Top Document","Category","Prompt"]

df_data = []
for idx in range(profilelen):
    article_id, article_text = articles_id[idx], articles[idx]
    documents, label, prompt_text = retrieval_list[idx], category_list[idx], prompt_list[idx]
    df_data.extend(
        [article_id, article_text, doc, label, prompt_text] for doc in documents
    )

df = pd.DataFrame(df_data, columns=columns)

# Show the first 20 rows.
display(df.head(20))

Unnamed: 0,ID,Article,Top Document,Category,Prompt
0,110,Calls for presidential impeachment have cast a...,The Electoral College will cast its vote for P...,politics,The category for the article: '['The Electoral...
1,111,With less than 10 weeks to go before the midte...,The Electoral College will cast its vote for P...,politics,The category for the article: '['The Electoral...
2,112,The U.S. has the most powerful military in the...,President Putin's ultimate ambitions are not k...,politics,"The category for the article: '[""President Put..."
3,113,"William Strampel faces multiple charges, the s...","He won the Pulitzer Prize, the National Book A...",culture & arts,The category for the article: '['He won the Pu...
4,114,The teenager allegedly used an assault rifle t...,Pres. Donald Trump was briefed on the attack a...,politics,The category for the article: '['Pres. Donald ...
5,115,He revealed he wants to work with Justin Biebe...,This list will make you want to catch the next...,travel,The category for the article: '['This list wil...
6,116,"""I can just imagine all the little girls who h...",There may be exciting things to come for Eleve...,entertainment,The category for the article: '['There may be ...
7,117,"""I feel like people make such a big deal out o...",This list will make you want to catch the next...,travel,The category for the article: '['This list wil...
8,118,Yet another clue that lots of Harry music is d...,It's a bird! It's a plane! It's ... Harry Styl...,entertainment,"The category for the article: '[""It's a bird! ..."
9,119,As well as a time jump and new details about t...,Why only be a mermaid for Halloween when you c...,style & beauty,The category for the article: '['Why only be a...


<h2>LARGE LANGUAGE MODEL</h2>

# Load the FLAN-T5 large tokenizer and model.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")

# Generate one answer per prompt and keep the decoded text of the first
# returned sequence, with special tokens removed.
final_cats = []
for prompt_text in prompt_list:
    encoded = tokenizer(prompt_text, return_tensors="pt")
    generated = model.generate(encoded.input_ids)
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    final_cats.append(decoded)

print(final_cats)

<h2>EVALUATION</h2>

In [13]:
# Run the model on the prompt of every row in `df` and collect the predictions
# as {"id", "output"} records for the evaluation file.
eval_list = []

for _, row in df.iterrows():
    input_ids = tokenizer(row["Prompt"], return_tensors="pt").input_ids
    outputs = model.generate(input_ids)
    # A dedicated name is used here so the `final_cats` list built in the
    # previous cell is no longer overwritten with a single string.
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    eval_list.append({"id": str(row["ID"]), "output": prediction})

eval_json = { "task" : "LaMP_2", "golds" : eval_list }

# Serialize the predictions and write them next to the dataset files.
genJSON = json.dumps(eval_json, indent=2)
loc = "./DATASET/evalO.json"
with open(loc, "w") as jf:
    jf.write(genJSON)

<b>The command below evaluates the language model's predictions against the gold labels.</b>
> python eval_task.py --golds_json validationO.json --preds_json evalO.json --task_name LaMP_2 --output_file metrics.txt

Running the evaluation produced the following output:
{"accuracy": 0.6511406844106464, "f1": 0.47087643223840003}