### N-Gram Model

In [1]:
# N-Gram model.

def generate_ngrams(text, n):
    """Return every contiguous word n-gram of ``text`` as a space-joined string.

    The text is tokenized with ``str.split()``, i.e. on runs of whitespace,
    so punctuation stays attached to the word it touches.

    Args:
        text: Input string to split into words.
        n: Number of words per n-gram; must be at least 1.

    Returns:
        A list of n-grams in order of appearance. The list is empty when
        ``text`` contains fewer than ``n`` words.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # Without this guard n == 0 yields a list of empty strings and a negative
    # n yields meaningless slices, so reject both explicitly.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")
    words = text.split()
    # There are len(words) - n + 1 window start positions (none if that is <= 0).
    return [" ".join(words[i:i + n]) for i in range(len(words) - n + 1)]

# Example usage: split a sample sentence into word n-grams and print the list.
text = "I'm always waiting for you to be waiting below Devils roll the dice, angels roll their eyes What doesn't kill me makes me want you more."
# The n-gram size is read interactively from stdin each time the cell runs;
# int() raises ValueError if the entered text is not an integer.
n = int(input("Enter the n: "))
ngrams = generate_ngrams(text, n)
print(ngrams)

Enter the n:  4


["I'm always waiting for", 'always waiting for you', 'waiting for you to', 'for you to be', 'you to be waiting', 'to be waiting below', 'be waiting below Devils', 'waiting below Devils roll', 'below Devils roll the', 'Devils roll the dice,', 'roll the dice, angels', 'the dice, angels roll', 'dice, angels roll their', 'angels roll their eyes', 'roll their eyes What', "their eyes What doesn't", "eyes What doesn't kill", "What doesn't kill me", "doesn't kill me makes", 'kill me makes me', 'me makes me want', 'makes me want you', 'me want you more.']


### TF-IDF Model

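$$\mathrm{TF}(t, d) = \frac{\text{number of times term } t \text{ appears in document } d}{\text{total number of terms in document } d}$$

$$\mathrm{IDF}(t) = \log\left(\frac{\text{total number of documents}}{\text{number of documents containing term } t}\right)$$

Note: scikit-learn's `TfidfVectorizer` (used below) with default settings differs from these textbook formulas: it uses the raw term count as TF, the smoothed IDF $\ln\frac{1 + N}{1 + \mathrm{df}(t)} + 1$ (where $N$ is the number of documents and $\mathrm{df}(t)$ the number of documents containing $t$), and then L2-normalizes each document vector.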

In [2]:
#TF-IDF model.
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: each string is treated as one document.
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Fit the vectorizer on the corpus and transform it in a single step.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Column j of the dense matrix holds the score for feature_names[j].
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_matrix_dense = tfidf_matrix.toarray()

# For every document, list only the terms that received a positive score.
for i, doc in enumerate(documents):
    print(f"Document {i + 1}: {doc}")
    # Walk the vocabulary and this document's row of scores in lockstep.
    for term, tfidf_score in zip(feature_names, tfidf_matrix_dense[i]):
        if tfidf_score > 0:
            print(f"{term}: {tfidf_score:.4f}")
    print()


Document 1: This is the first document.
document: 0.4698
first: 0.5803
is: 0.3841
the: 0.3841
this: 0.3841

Document 2: This document is the second document.
document: 0.6876
is: 0.2811
second: 0.5386
the: 0.2811
this: 0.2811

Document 3: And this is the third one.
and: 0.5118
is: 0.2671
one: 0.5118
the: 0.2671
third: 0.5118
this: 0.2671

Document 4: Is this the first document?
document: 0.4698
first: 0.5803
is: 0.3841
the: 0.3841
this: 0.3841

