In [63]:
from collections import Counter
from math import log

In [64]:
# List of sentences
sentences = [
    "sunshine state enjoy sunshine",
    "brown fox jump high, brown fox run",
    "sunshine state fox run fast",
]

In [65]:
# Create a vocabulary list of unique words from the sentences
vocab = set()
for sentence in sentences:
    for word in sentence.split():
        vocab.add(word)
vocab = list(vocab)

In [66]:
# Document-term matrix
doc_term_matrix = []
for sentence in sentences:
   
    bow_vector = [0] * len(vocab)
    for word in sentence.split():
        if word in vocab:
            bow_vector[vocab.index(word)] += 1
    doc_term_matrix.append(bow_vector)

In [67]:
# Compute the TF vectors
tf_vectors = []
for bow_vector in doc_term_matrix:
    # Normalize the BoW vector by dividing each element by the total number of words in the sentence
    tf_vector = [word_count / len(bow_vector) for word_count in bow_vector]
    tf_vectors.append(tf_vector)

In [68]:
# Compute the IDF values
idf_values = []
for term in vocab:
    # Count the number of documents that contain the term
    doc_count = 0
    for bow_vector in doc_term_matrix:
        if bow_vector[vocab.index(term)] > 0:
            doc_count += 1
    # IDF value for the term
    idf_value = log(len(sentences) / doc_count)
    idf_values.append(idf_value)

In [69]:
# Calculate the TF.IDF vectors
tfidf_vectors = []
for tf_vector in tf_vectors:
    #TF vector * IDF values
    tfidf_vector = [tf * idf for tf, idf in zip(tf_vector, idf_values)]
    tfidf_vectors.append(tfidf_vector)


In [70]:
# Print the results
print("Vocabulary:\n",  vocab)
print("Document-term matrix:\n",doc_term_matrix)
print("TF vectors:\n ",  tf_vectors)
print("IDF values:\n",  idf_values)
print("TF.IDF vectors:\n",  tfidf_vectors)

Vocabulary:
 ['brown', 'state', 'sunshine', 'jump', 'fox', 'high,', 'fast', 'enjoy', 'run']
Document-term matrix:
 [[0, 1, 2, 0, 0, 0, 0, 1, 0], [2, 0, 0, 1, 2, 1, 0, 0, 1], [0, 1, 1, 0, 1, 0, 1, 0, 1]]
TF vectors:
  [[0.0, 0.1111111111111111, 0.2222222222222222, 0.0, 0.0, 0.0, 0.0, 0.1111111111111111, 0.0], [0.2222222222222222, 0.0, 0.0, 0.1111111111111111, 0.2222222222222222, 0.1111111111111111, 0.0, 0.0, 0.1111111111111111], [0.0, 0.1111111111111111, 0.1111111111111111, 0.0, 0.1111111111111111, 0.0, 0.1111111111111111, 0.0, 0.1111111111111111]]
IDF values:
 [1.0986122886681098, 0.4054651081081644, 0.4054651081081644, 1.0986122886681098, 0.4054651081081644, 1.0986122886681098, 1.0986122886681098, 1.0986122886681098, 0.4054651081081644]
TF.IDF vectors:
 [[0.0, 0.04505167867868493, 0.09010335735736986, 0.0, 0.0, 0.0, 0.0, 0.12206803207423442, 0.0], [0.24413606414846883, 0.0, 0.0, 0.12206803207423442, 0.09010335735736986, 0.12206803207423442, 0.0, 0.0, 0.04505167867868493], [0.0, 0.0450

In [None]:
# Question no 2


In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [73]:
# Define the strings
S1 = "sunshine state enjoy sunshine"
S3 = "sunshine state fox run fast"

In [81]:

# Create a list of the strings
strings = [S1, S3]

In [82]:
# Create a TfidfVectorizer object
vectorizer = TfidfVectorizer()


In [84]:
# Convert the strings to a matrix of Tf-idf features
tfidf_matrix = vectorizer.fit_transform(strings)

In [86]:
# Print the cosine similarity
print(cosine_similarity[0][1])

0.40347057701869526
