In [5]:
import spacy
from gensim import corpora, models
import gensim
from nltk.corpus import reuters  # Use NLTK's Reuters corpus
import nltk

# Download the Reuters corpus
nltk.download('reuters')
nltk.download('punkt')

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Extracting documents from Reuters corpus
documents = [reuters.raw(fileid) for fileid in reuters.fileids()[:100]]  # Limit to first 100 documents for example

# Preprocess the text
texts = []
for document in documents:
    text = []
    doc = nlp(document)
    for w in doc:
        if not w.is_stop and not w.is_punct and not w.like_num:
            text.append(w.lemma_)
    texts.append(text)

# Create bigrams and trigrams
bigram = gensim.models.Phrases(texts, min_count=2, threshold=1)
trigram = gensim.models.Phrases(bigram[texts], min_count=1, threshold=1)

# Transform the texts to include bigrams and trigrams
bigram_texts = [bigram[line] for line in texts]
trigram_texts = [trigram[line] for line in bigram_texts]

# Create a dictionary and corpus for bag-of-words
dictionary = corpora.Dictionary(trigram_texts)
corpus = [dictionary.doc2bow(text) for text in trigram_texts]

# Create TF-IDF representation
tfidf_model = models.TfidfModel(corpus)
tfidf_corpus = tfidf_model[corpus]

# Print outputs
print("Dictionary (word_id):")
print(dictionary.token2id)

print("\nBigrams:")
for text in bigram_texts:
    print(text)

print("\nTrigrams:")
for text in trigram_texts:
    print(text)

print("\nTF-IDF Representation:")
for doc in tfidf_corpus:
    print(doc)


[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Dictionary (word_id):

Bigrams:
['CHINA', 'DAILY', 'say', 'VERMIN', 'EAT', 'PCT', 'GRAIN', 'stock', '\n  ', 'survey', 'province', 'city', '\n  ', 'show', 'vermin', 'consume', 'pct', 'China', 'grain', '\n  ', 'stock', 'China', 'Daily', 'say_\n      ', 'say', 'year', 'mln_tonne', 'pct_\n  ', 'China', 'fruit', 'output', 'leave', 'rot', 'mln_tonne', '\n  ', 'pct', 'vegetable', 'paper', 'blame', 'waste', '\n  ', 'inadequate', 'storage', 'bad', 'preservation', 'method', '\n      _say', 'government', 'launch', 'national', 'programme', '\n  ', 'reduce', 'waste', 'call', 'improved', 'technology', 'storage', '\n  ', 'preservation', 'great', 'production', 'additive', 'paper', '\n  ', 'give_detail', '\n  \n\n']
['JAPAN', 'REVISE', 'long_term', 'energy', 'demand', 'downward', '\n  ', 'Ministry', 'International_Trade', '\n  _Industry', 'MITI', 'revise', 'long_term', 'energy', 'supply_demand', '\n  ', 'outlook', 'August', 'meet', 'forecast', 'downtrend', 'japanese', '\n  ', 'energy', 'demand', 'minis