In [78]:
import math

class TopicExtractor:
    def __init__(self, docs, stopwords=set()):
        self.vocab = set()
        self.word_counts = {}
        self.doc_topic_counts = {}
        self.topic_counts = {}
        self.num_topics = 0
        self.stopwords = stopwords
        
        for doc in docs:
            title, text = doc['title'], doc['text']
            doc_topics = set()
            words = self.tokenize(title + ' ' + text)
            for word in words:
                if word not in self.word_counts:
                    self.word_counts[word] = 0
                if word not in self.vocab:
                    self.vocab.add(word)
                
                self.word_counts[word] += 1
                
                if len(doc_topics) == 0 or word in title:
                    topic = self.add_topic()
                    doc_topics.add(topic)
                
                for topic in doc_topics:
                    if topic not in self.doc_topic_counts:
                        self.doc_topic_counts[topic] = {}
                    if word not in self.doc_topic_counts[topic]:
                        self.doc_topic_counts[topic][word] = 0
                    
                    self.doc_topic_counts[topic][word] += 1
                    self.topic_counts[topic] += 1
        
    def add_topic(self):
        self.num_topics += 1
        self.topic_counts[self.num_topics] = 0
        return self.num_topics
        
    def tokenize(self, text):
        # Split text into tokens, removing punctuation and converting to lowercase
        return [word.strip('.,!?:;()[]{}/\\\'\"').lower() for word in text.split()]
    
    def p_word_topic(self, word, topic):
        # Compute P(word|topic) using Laplace smoothing
        num = self.doc_topic_counts[topic].get(word, 0) + 1
        denom = self.topic_counts[topic] + len(self.vocab)
        return num / denom if word not in self.stopwords else num / denom / 10
    
    def p_topic_doc(self, doc, topic):
        # Compute P(topic|doc) using Bayes' rule and assuming uniform prior P(topic)
        words = self.tokenize(doc['title'] + ' ' + doc['text'])
        title_words = self.tokenize(doc['title'])
        
        # Use title words as prior, with higher weight
        p = math.log(sum(self.p_word_topic(word, topic) for word in title_words))
        p += sum(math.log(self.p_word_topic(word, topic)) for word in words if word not in title_words)
        
        return p
    
    def extract_topics(self, doc):
        # Extract the most probable topic for a given document
        topics = list(range(1, self.num_topics+1))
        probs = [self.p_topic_doc(doc, topic) for topic in topics]
        max_prob = max(probs)
        max_topic = topics[probs.index(max_prob)]
        return self.topic_words(max_topic)
    
    def topic_words(self, topic, n_words=10):
        # Return the top n_words words for a given topic, based on P(word|topic)
        word_probs = [(word, self.p_word_topic(word, topic)) for word in self.vocab]
        top_words = sorted(word_probs, key=lambda x: x[1], reverse=True)[:n_words]
        return [word[0] for word in top_words]



In [80]:
import json
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

docs=[]
with open('top_articles.json','r') as f:
    docs=json.load(f)

te = TopicExtractor(docs, stop_words)
for doc in docs:
    main_topic = te.extract_topics(doc)
    #update the doc with the main topic
    doc['title'] = main_topic
    print(doc['title'])
    



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ay477\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['grayscale', 'says', 'gbtc', 'shares', 'bitcoin', 'investors', 'trust', 'the', 'sec', 'sonnenshein']
['tablet', 'netflix', 'new', 'oneplus', 'android', 'the', 'account', 'service', 'used', '+']
['mining', 'says', 'bitcoin', 'debt', 'miners', 'company', 'the', 'price', 'facilities', 'market']
['el', 'bitcoin', 'country', 'salvador', 'crypto', 'salvador’s', 'last', 'embassy', 'texas', 'country’s']
['market', 'bitcoin', 'since', 'crypto', 'said', 'investors', 'week', 'news', 'data', 'last']
['people', 'said', 'crypto', 'cryptocurrency', 'best', 'munger', 'picks', 'completely', 'call', 'currencies']
['bitcoin', 'el', 'cryptocurrency', 'texas', 'november', 'salvador', 'adoption', 'mayorga', 'state', 'embassy']
['bitcoin', 'price', 'griffin', 'token', 'one', 'floors', 'still', 'single', 'whale', 'finance']
['bitcoin', 'market', 'best', 'current', 'price', 'crypto', 'cryptocurrency', 'report', 'said', "bitcoin's"]
['grayscale', 'bitcoin', 'sec', 'reuters', 'genesis', 'sonnenshein', 'court', 

In [None]:
# TODO: generate titles using the main topic



import spacy
import random

#set extension
spacy.tokens.Doc.set_extension("adjacencies", getter=lambda doc: [token.text for token in doc if token.pos_ == "ADJ" and not token._.is_stopword], force=True)

#download the model
nlp = spacy.load("en_core_web_sm")



def generate_title(words):
    # Join the words into a single string
    text = " ".join(words)
    
    # Parse the text using Spacy
    doc = nlp(text)
    
    # Extract the noun phrases from the document
    noun_phrases = [chunk.text for chunk in doc.noun_chunks]
    
    # If there are no noun phrases, use the original words
    if len(noun_phrases) == 0:
        noun_phrases = words
    
    # Choose a random noun phrase and adjective to use in the title
    np = random.choice(noun_phrases)
    adj = random.choice(doc._.adjacencies)
    # Capitalize the first letter of the noun phrase and adjective
    np = np.capitalize()
    adj = adj.capitalize()
    
    # Combine the noun phrase and adjective to form the title
    title = f"{adj} {np}"
    
    return title

for doc in docs:
    title = generate_title(doc['title'])
    print(title)
    doc['title'] = title
