In [6]:
import nltk
from nltk.corpus import reuters

# Make sure the Reuters corpus is available locally before reading from it
nltk.download('reuters')

# Sample: the first 10 file ids listed by the corpus reader
docs = reuters.fileids()[:10]

# Raw text of each sampled file, in the same order as `docs`
documents = list(map(reuters.raw, docs))

# Continue preprocessing as done above, and repeat the steps for vectorization and n-gram generation.


[nltk_data] Downloading package reuters to /home/sami/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


In [7]:
import spacy
from gensim import corpora

# Load spaCy's small English model.
# `nlp` is the pipeline object the preprocessing step below applies to each
# document; the per-token attributes read there (lemma_, is_stop, is_punct,
# like_num) come from the Doc it produces.
# NOTE(review): presumably the "en_core_web_sm" model package must already be
# installed in this environment — confirm before running on a fresh kernel.
nlp = spacy.load("en_core_web_sm")

# Toy corpus: nine sentences about Albert Camus, one document per sentence.
# (Plain string literals; Python 3 strings are already unicode.)
documents = [
    "Albert Camus was a French philosopher, author, and journalist, known for his contributions to existentialism and absurdism.",
    "Born in Algeria in 1913, Camus spent much of his life exploring the themes of human existence and the meaning of life.",
    "His most famous works include 'The Stranger,' a novel that delves into the absurdity of life, and 'The Myth of Sisyphus,' an essay that examines existential philosophy.",
    "Camus believed that life was inherently meaningless, but that individuals could find purpose by embracing personal freedom and moral responsibility.",
    "In 1957, he was awarded the Nobel Prize in Literature for his significant contributions to literature and philosophy.",
    "Camus' philosophy was deeply influenced by the political turmoil of his time, particularly his experiences during World War II and the French-Algerian conflict.",
    "Although often associated with existentialism, Camus rejected the label, preferring to define his own ideas, which focused more on the absurd nature of life rather than existential despair.",
    "He died tragically in a car accident in 1960, leaving behind a legacy that continues to influence modern thought and literature.",
    "Today, Albert Camus is celebrated for his exploration of the human condition, ethics, and the search for meaning in a world that often seems indifferent.",
]


# Preprocess the text: one list of lemmas per document.
# A token is kept only if it is not a stop word, not punctuation, not
# number-like, and not whitespace. The whitespace check matters for raw,
# multi-line input (e.g. news articles): spaCy emits newline / multi-space
# runs as tokens of their own, and none of the other three flags reject them,
# so without it they would end up as "words" in the vocabulary.
texts = [
    [
        token.lemma_
        for token in nlp(document)
        if not (
            token.is_stop
            or token.is_punct
            or token.like_num
            or token.is_space
        )
    ]
    for document in documents
]

print("Preprocessed Texts: ", texts)


Preprocessed Texts:  [['Albert', 'Camus', 'french', 'philosopher', 'author', 'journalist', 'know', 'contribution', 'existentialism', 'absurdism'], ['bear', 'Algeria', 'Camus', 'spend', 'life', 'explore', 'theme', 'human', 'existence', 'meaning', 'life'], ['famous', 'work', 'include', 'Stranger', 'novel', 'delve', 'absurdity', 'life', 'Myth', 'Sisyphus', 'essay', 'examine', 'existential', 'philosophy'], ['Camus', 'believe', 'life', 'inherently', 'meaningless', 'individual', 'find', 'purpose', 'embrace', 'personal', 'freedom', 'moral', 'responsibility'], ['award', 'Nobel', 'Prize', 'Literature', 'significant', 'contribution', 'literature', 'philosophy'], ['Camus', 'philosophy', 'deeply', 'influence', 'political', 'turmoil', 'time', 'particularly', 'experience', 'World', 'War', 'II', 'french', 'algerian', 'conflict'], ['associate', 'existentialism', 'Camus', 'reject', 'label', 'prefer', 'define', 'idea', 'focus', 'absurd', 'nature', 'life', 'existential', 'despair'], ['die', 'tragically',

In [8]:
# Vocabulary: Gensim assigns an integer id to every distinct token in `texts`
dictionary = corpora.Dictionary(texts)

# Encode each token list in bag-of-words form using that vocabulary
corpus = list(map(dictionary.doc2bow, texts))

# Show the token -> id mapping first, then the encoded corpus
print("Dictionary Token to ID mapping: ", dictionary.token2id)
print("Bag of Words Corpus: ", corpus)


Dictionary Token to ID mapping:  {'Albert': 0, 'Camus': 1, 'absurdism': 2, 'author': 3, 'contribution': 4, 'existentialism': 5, 'french': 6, 'journalist': 7, 'know': 8, 'philosopher': 9, 'Algeria': 10, 'bear': 11, 'existence': 12, 'explore': 13, 'human': 14, 'life': 15, 'meaning': 16, 'spend': 17, 'theme': 18, 'Myth': 19, 'Sisyphus': 20, 'Stranger': 21, 'absurdity': 22, 'delve': 23, 'essay': 24, 'examine': 25, 'existential': 26, 'famous': 27, 'include': 28, 'novel': 29, 'philosophy': 30, 'work': 31, 'believe': 32, 'embrace': 33, 'find': 34, 'freedom': 35, 'individual': 36, 'inherently': 37, 'meaningless': 38, 'moral': 39, 'personal': 40, 'purpose': 41, 'responsibility': 42, 'Literature': 43, 'Nobel': 44, 'Prize': 45, 'award': 46, 'literature': 47, 'significant': 48, 'II': 49, 'War': 50, 'World': 51, 'algerian': 52, 'conflict': 53, 'deeply': 54, 'experience': 55, 'influence': 56, 'particularly': 57, 'political': 58, 'time': 59, 'turmoil': 60, 'absurd': 61, 'associate': 62, 'define': 63,

In [9]:
from gensim import models

# Fit the TF-IDF model on the bag-of-words corpus
tfidf_model = models.TfidfModel(corpus)

# Re-weight the whole corpus, then print each document's vector on its own line
tfidf_corpus = tfidf_model[corpus]
for weighted_bow in tfidf_corpus:
    print("TF-IDF Representation: ", weighted_bow)


TF-IDF Representation:  [(0, 0.26043946397874795), (1, 0.07020856482791948), (2, 0.38046179830165694), (3, 0.38046179830165694), (4, 0.26043946397874795), (5, 0.26043946397874795), (6, 0.26043946397874795), (7, 0.38046179830165694), (8, 0.38046179830165694), (9, 0.38046179830165694)]
TF-IDF Representation:  [(1, 0.06731057576798621), (10, 0.3647575301414525), (11, 0.3647575301414525), (12, 0.3647575301414525), (13, 0.3647575301414525), (14, 0.24968934083871247), (15, 0.26924230307194486), (16, 0.24968934083871247), (17, 0.3647575301414525), (18, 0.3647575301414525)]
TF-IDF Representation:  [(15, 0.10719188234526751), (19, 0.290437615555719), (20, 0.290437615555719), (21, 0.290437615555719), (22, 0.290437615555719), (23, 0.290437615555719), (24, 0.290437615555719), (25, 0.290437615555719), (26, 0.19881474895049328), (27, 0.290437615555719), (28, 0.290437615555719), (29, 0.290437615555719), (30, 0.1452188077778595), (31, 0.290437615555719)]
TF-IDF Representation:  [(1, 0.0552137545667147

In [10]:
import gensim

def _fit_phrases(token_lists):
    """Train a Phrases model on `token_lists` and build its Phraser wrapper.

    Uses the same settings for every n-gram level (min_count=1, threshold=2).
    Returns a `(phrases_model, phraser)` pair.
    """
    phrases_model = gensim.models.Phrases(token_lists, min_count=1, threshold=2)
    return phrases_model, gensim.models.phrases.Phraser(phrases_model)


# Bigrams: learn phrases from the preprocessed texts, then rewrite each text
bigram, bigram_mod = _fit_phrases(texts)
texts_bigrams = [bigram_mod[tokens] for tokens in texts]
print("Bigrams: ", texts_bigrams)

# Trigrams: run the same procedure again on the bigram-merged texts
trigram, trigram_mod = _fit_phrases(texts_bigrams)
texts_trigrams = [trigram_mod[tokens] for tokens in texts_bigrams]
print("Trigrams: ", texts_trigrams)


Bigrams:  [['Albert_Camus', 'french', 'philosopher', 'author', 'journalist', 'know', 'contribution', 'existentialism', 'absurdism'], ['bear', 'Algeria', 'Camus', 'spend', 'life', 'explore', 'theme', 'human', 'existence', 'meaning', 'life'], ['famous', 'work', 'include', 'Stranger', 'novel', 'delve', 'absurdity', 'life', 'Myth', 'Sisyphus', 'essay', 'examine', 'existential', 'philosophy'], ['Camus', 'believe', 'life', 'inherently', 'meaningless', 'individual', 'find', 'purpose', 'embrace', 'personal', 'freedom', 'moral', 'responsibility'], ['award', 'Nobel', 'Prize', 'Literature', 'significant', 'contribution', 'literature', 'philosophy'], ['Camus', 'philosophy', 'deeply', 'influence', 'political', 'turmoil', 'time', 'particularly', 'experience', 'World', 'War', 'II', 'french', 'algerian', 'conflict'], ['associate', 'existentialism', 'Camus', 'reject', 'label', 'prefer', 'define', 'idea', 'focus', 'absurd', 'nature', 'life', 'existential', 'despair'], ['die', 'tragically', 'car', 'accid