In [12]:
# Dataset: https://huggingface.co/datasets/multi_news
# Execute the shared utility notebook; the -i flag runs it in this kernel's
# interactive namespace, so whatever it defines becomes available to the cells below.
# NOTE(review): `small_model`, used in the last cell, is not defined in this
# notebook -- presumably it comes from lang_utils.ipynb; confirm.
%run -i "../util/lang_utils.ipynb"

In [13]:
from datasets import load_dataset
from nltk import word_tokenize
from math import ceil
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords


In [14]:
# Load the "train" and "test" splits of the multi_news dataset and convert each
# one to a pandas DataFrame for the cells below.
train_dataset = load_dataset("multi_news", split="train")
test_dataset = load_dataset("multi_news", split="test")
train_df = train_dataset.to_pandas()
test_df = test_dataset.to_pandas()
# Quick look at the training frame; the printed output shows two text columns,
# "document" and "summary".
print(train_df)

                                                document  \
0      National Archives \n \n Yes, it’s that time ag...   
1      LOS ANGELES (AP) — In her first interview sinc...   
2      GAITHERSBURG, Md. (AP) — A small, private jet ...   
3      Tucker Carlson Exposes His Own Sexism on Twitt...   
4      A man accused of removing another man's testic...   
...                                                  ...   
44967  More than 670,000 copies of the Pearls’ self-p...   
44968  Seeking out cost-conscious consumers who have ...   
44969  Click to email this to a friend (Opens in new ...   
44970  BARRINGTON, R.I. (AP) — Women clad in yoga pan...   
44971  Based on a ‘real’ story, the hit John Travolta...   

                                                 summary  
0      – The unemployment rate dropped to 8.2% last m...  
1      – Shelly Sterling plans "eventually" to divorc...  
2      – A twin-engine Embraer jet that the FAA descr...  
3      – Tucker Carlson is in deep doodoo w

In [15]:
# TF-IDF model learned from the training summaries only.
# stop_words='english' uses scikit-learn's built-in English stop-word list;
# min_df=2 / max_df=0.95 are document-frequency cut-offs that drop very rare
# and near-ubiquitous terms from the vocabulary.
vectorizer = TfidfVectorizer(stop_words='english', min_df=2, max_df=0.95)
# Bare expression on the last line: the fitted estimator is the cell's output.
vectorizer.fit(train_df["summary"])

In [16]:
def sort_coo(coo_matrix):
    """Rank the stored entries of a COO-style matrix from highest to lowest value.

    Adapted from https://kavita-ganesan.com/python-keyword-extraction/

    Args:
        coo_matrix: object exposing parallel ``col`` and ``data`` sequences
            (column index and stored value of each entry).

    Returns:
        A list of ``(col, value)`` tuples ordered by value descending; entries
        with equal values are ordered by column index descending.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    # Primary key is the value, secondary key is the column index.
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

In [17]:
def get_keyword_strings(vectorizer, num_words, sorted_vector):
    """Translate the first ``num_words`` ranked entries into vocabulary strings.

    Args:
        vectorizer: object whose ``get_feature_names_out()`` returns a sequence
            mapping feature index -> term.
        num_words: maximum number of entries to take from the front of
            ``sorted_vector``.
        sorted_vector: sequence of ``(feature_index, score)`` pairs, already
            ordered by the caller.

    Returns:
        List of terms, in the same order as the selected entries.
    """
    vocabulary = vectorizer.get_feature_names_out()
    top_entries = sorted_vector[:num_words]
    # Only the feature index is needed; the score is ignored.
    return [vocabulary[feature_id] for feature_id, _ in top_entries]

In [28]:
def get_keywords_simple(vectorizer, input_text, num_output_words=10):
    """Return the highest-scoring vocabulary terms of ``input_text``.

    Args:
        vectorizer: fitted vectorizer providing ``transform`` and
            ``get_feature_names_out``.
        input_text: the text to extract keywords from.
        num_output_words: maximum number of keywords to return.

    Returns:
        List of terms ordered by ``sort_coo`` (highest score first).
    """
    # transform() expects an iterable of documents, hence the one-element list.
    tfidf_row = vectorizer.transform([input_text])
    ranked_entries = sort_coo(tfidf_row.tocoo())
    return get_keyword_strings(vectorizer, num_output_words, ranked_entries)

In [29]:
# Demo: show the first test summary, then the keywords extracted from it with
# the unigram TF-IDF model (default of 10 keywords).
print(test_df.iloc[0]["summary"])
keywords = get_keywords_simple(vectorizer, test_df.iloc[0]["summary"])
print(keywords)

– It's a race for the governor's mansion in 11 states today, and the GOP could end the night at the helm of more than two-thirds of the 50 states. The GOP currently controls 29 of the country's top state offices; it's expected to keep the three Republican ones that are up for grabs (Utah, North Dakota, and Indiana), and wrest North Carolina from the Dems. That brings its toll to 30, with the potential to take three more, reports NPR. Races in Montana, New Hampshire, and Washington are still too close to call, and in all three, Democrat incumbents aren't seeking reelection. The results could have a big impact on health care, since a Supreme Court ruling grants states the ability to opt out of ObamaCare's Medicaid expansion. "A Romney victory would dramatically empower Republican governors," said one analyst. Click for NPR's state-by-state breakdown of what could happen.
['states', 'npr', 'wrest', 'state', 'gop', 'incumbents', 'republican', 'empower', 'north', 'reelection']


In [67]:
# Start from NLTK's English stop-word list but keep "the" as a regular token,
# so that phrases beginning with it (e.g. "the gop" in the output further down)
# can become n-gram features.
stop_words = list(stopwords.words('english'))
stop_words.remove("the")
# ngram_range=(1,3): vocabulary contains unigrams, bigrams and trigrams.
trigram_vectorizer = TfidfVectorizer(stop_words=stop_words, min_df=2, ngram_range=(1,3), max_df=0.95)
# Bare expression on the last line: the fitted estimator is the cell's output.
trigram_vectorizer.fit(train_df["summary"])

In [68]:
def get_keyword_strings_all(vectorizer, sorted_vector):
    """Translate every ranked entry into its vocabulary string.

    Args:
        vectorizer: object whose ``get_feature_names_out()`` returns a sequence
            mapping feature index -> term.
        sorted_vector: iterable of ``(feature_index, score)`` pairs, already
            ordered by the caller.

    Returns:
        List of terms, one per entry, in the same order as ``sorted_vector``.
    """
    vocabulary = vectorizer.get_feature_names_out()
    # Only the feature index is needed; the score is ignored.
    return [vocabulary[feature_id] for feature_id, _ in sorted_vector]

In [73]:
def get_keywords_complex(vectorizer, input_text, spacy_model, num_words=70):
    """Return top-ranked TF-IDF n-grams of ``input_text`` that are also noun chunks.

    The text is scored with ``vectorizer``; its n-grams are ranked by
    ``sort_coo`` (highest score first) and the first ``num_words`` of them are
    kept only if they exactly match (case-insensitively) a noun chunk found by
    ``spacy_model`` and are not purely numeric.

    Args:
        vectorizer: fitted vectorizer providing ``transform`` and
            ``get_feature_names_out``.
        input_text: the text to extract keywords from.
        spacy_model: callable applied to ``input_text``; the result must expose
            ``noun_chunks`` whose items have a ``text`` attribute.
        num_words: how many of the top-ranked n-grams to consider. This is an
            upper bound on candidates examined, not on keywords returned.

    Returns:
        List of keyword strings in ranking order, without duplicates. May be
        shorter than ``num_words`` (or empty).
    """
    doc = spacy_model(input_text)
    # transform() expects an iterable of documents, hence the one-element list.
    vector = vectorizer.transform([input_text])
    ranked_entries = sort_coo(vector.tocoo())
    ngrams = get_keyword_strings_all(vectorizer, ranked_entries)
    # Set for O(1) membership tests; lower-cased so the comparison below is
    # case-insensitive.
    noun_chunks = {chunk.text.lower() for chunk in doc.noun_chunks}

    keywords = []
    # Slice instead of indexing 0..num_words-1: a text with fewer than
    # num_words in-vocabulary n-grams would otherwise raise IndexError.
    # max(..., 0) keeps a negative num_words meaning "no candidates".
    for keyword in ngrams[:max(num_words, 0)]:
        if keyword.isdigit() or keyword in keywords:
            continue
        if keyword.lower() in noun_chunks:
            keywords.append(keyword)
    return keywords

In [74]:
# Demo: noun-chunk-filtered keywords for the first test summary, using the
# 1-3-gram TF-IDF model.
# NOTE(review): `small_model` is not defined in this notebook -- presumably a
# spaCy pipeline provided by ../util/lang_utils.ipynb; confirm.
keywords = get_keywords_complex(trigram_vectorizer, test_df.iloc[0]["summary"], small_model)
print(keywords)

['the gop', 'the 50 states', 'npr', '11 states', 'state', 'republican governors', 'the dems', 'reelection', 'the helm', 'grabs']
