In [1]:
 # ================================
# FULL IMPLEMENTATION OF
# TF-IDF AND N-GRAM MODELS
# ================================

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# -------------------------------
# 1. Input article (5 sentences, 73 words)
# -------------------------------

# Assembled from adjacent string literals, one sentence per line. The text
# starts and ends with a newline and has no punctuation other than the
# full stop closing each sentence.
article = (
    "\n"
    "Machine learning is a branch of artificial intelligence that enables computers to learn from data.\n"
    "It focuses on building models that can identify patterns and make predictions without being explicitly programmed.\n"
    "Machine learning is widely used in areas such as healthcare finance education and transportation.\n"
    "Common applications include recommendation systems spam detection image recognition and speech processing.\n"
    "As data continues to grow rapidly machine learning plays an important role in automating decision making.\n"
)

# The vectorizers below are fed a list of documents; here that list holds
# just the one article.
documents = [article]

# -------------------------------
# 2. TF-IDF MODEL
# -------------------------------

# Learn the vocabulary (English stop words dropped) and weight the corpus in
# a single pass.
# NOTE(review): `documents` contains only one article, so every term has the
# same document frequency and the IDF factor is identical for all terms. The
# scores printed below are therefore just scaled term counts; IDF only becomes
# informative with several documents -- confirm a one-document corpus is
# intended.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Vocabulary terms, in the same column order as the matrix.
words = tfidf_vectorizer.get_feature_names_out()

# Densify the first (and only) row into a flat vector of per-term scores.
tfidf_scores = tfidf_matrix[0].toarray().ravel()

print("===== TF-IDF SCORES =====")
for word, score in zip(words, tfidf_scores):
    # Term left-aligned in a 15-character column, score to four decimals.
    print(f"{word:15} : {score:.4f}")

# -------------------------------
# 3. N-GRAM MODELS
# -------------------------------


def _fit_ngram_counts(n):
    """Fit a count vectorizer restricted to word n-grams of exactly size *n*.

    English stop words are removed. Returns a ``(vectorizer, matrix)`` pair:
    the fitted ``CountVectorizer`` and the count matrix it produces for
    ``documents``.
    """
    vectorizer = CountVectorizer(ngram_range=(n, n), stop_words='english')
    return vectorizer, vectorizer.fit_transform(documents)


# NOTE(review): scikit-learn filters stop words before it assembles n-grams,
# and the whole article is a single document, so bigrams/trigrams may join
# words that were not adjacent in the original text or that sit in different
# sentences -- confirm this is the intended behaviour.

# UNIGRAM (n=1)
unigram_vectorizer, unigram = _fit_ngram_counts(1)

print("\n===== UNIGRAMS =====")
print(unigram_vectorizer.get_feature_names_out())

# BIGRAM (n=2)
bigram_vectorizer, bigram = _fit_ngram_counts(2)

print("\n===== BIGRAMS =====")
print(bigram_vectorizer.get_feature_names_out())

# TRIGRAM (n=3)
trigram_vectorizer, trigram = _fit_ngram_counts(3)

print("\n===== TRIGRAMS =====")
print(trigram_vectorizer.get_feature_names_out())


===== TF-IDF SCORES =====
applications    : 0.1250
areas           : 0.1250
artificial      : 0.1250
automating      : 0.1250
branch          : 0.1250
building        : 0.1250
common          : 0.1250
computers       : 0.1250
continues       : 0.1250
data            : 0.2500
decision        : 0.1250
detection       : 0.1250
education       : 0.1250
enables         : 0.1250
explicitly      : 0.1250
finance         : 0.1250
focuses         : 0.1250
grow            : 0.1250
healthcare      : 0.1250
identify        : 0.1250
image           : 0.1250
important       : 0.1250
include         : 0.1250
intelligence    : 0.1250
learn           : 0.1250
learning        : 0.3750
machine         : 0.3750
make            : 0.1250
making          : 0.1250
models          : 0.1250
patterns        : 0.1250
plays           : 0.1250
predictions     : 0.1250
processing      : 0.1250
programmed      : 0.1250
rapidly         : 0.1250
recognition     : 0.1250
recommendation  : 0.1250
role            : 0.1250