In [50]:
# 11/Dec/2023
# CSC461 – Assignment4 – NLP
# Usama Tufail
# FA21-BSE-053
# Compute BoW, TF, IDF, and then TF.IDF values for each term Sentences and also Compute the similarity between S1, S2, and S3 using cosine, manhattan, and euclidean distances.

In [None]:
!pip install scikit-learn

In [36]:
# Import important libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cityblock
import pandas as pd
import math


In [44]:
# Sample data
documents = (
    "data science is one of the most important courses in computer science",
    "this is one of the best data science courses",
    "the data scientists perform data analysis"
)

In [45]:
# Find bag of words
count_vectorizer = CountVectorizer()
matrix= count_vectorizer.fit_transform(documents)
tokens = count_vectorizer.get_feature_names_out()
bow_matrix = matrix.toarray()
df_bow = pd.DataFrame(data = bow_matrix, columns = tokens)
print("Bag of Words Matrix:")
print(df_bow)


Bag of Words Matrix:
   analysis  best  computer  courses  data  important  in  is  most  of  one  \
0         0     0         1        1     1          1   1   1     1   1    1   
1         0     1         0        1     1          0   0   1     0   1    1   
2         1     0         0        0     2          0   0   0     0   0    0   

   perform  science  scientists  the  this  
0        0        2           0    1     0  
1        0        1           0    1     1  
2        1        0           1    1     0  


In [46]:
# Find term frequency
def calculate_tf(corpus):
    tf_list = []
    for document in corpus:
        term_freq_dict = {}
        total_words = len(document.split())
        for word in document.split():
            term_freq_dict[word] = term_freq_dict.get(word, 0) + 1

        for word, freq in term_freq_dict.items():
            term_freq_dict[word] = round(freq / total_words, 2)

        tf_list.append(term_freq_dict)

    return tf_list

tf_results = calculate_tf(documents)
df_tf = pd.DataFrame(tf_results).fillna(0)


In [47]:
# Find inverse document frequency
def calculate_idf(corpus):
    N = len(corpus)
    term_doc_count = {}
    for document in corpus:
        unique_terms = set(document.split())
        for term in unique_terms:
            term_doc_count[term] = term_doc_count.get(term, 0) + 1

    idf_dict = {}
    for term, doc_count in term_doc_count.items():
        idf_dict[term] = round(math.log(N / (1 + doc_count)), 2)

    return idf_dict

idf_dict = calculate_idf(documents)
df_idf = pd.DataFrame(list(idf_dict.items()), columns=['Term', 'IDF'])


In [48]:
# Find TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = round(tfidf_vectorizer.fit_transform(documents), 2)

tfidf_tokens = tfidf_vectorizer.get_feature_names_out()
df_tfidf = pd.DataFrame(data=tfidf_matrix.toarray(), columns=tfidf_tokens)


In [49]:
# Generate cosine similarity matrix
cosine_similarity_matrix = cosine_similarity(tfidf_matrix)
df_cosine_similarity = pd.DataFrame(data=cosine_similarity_matrix)

# Calculate Manhattan distance
manhattan_distance_s1_s2 = round(1 / cityblock(df_tfidf.iloc[0], df_tfidf.iloc[1]), 2)
manhattan_distance_s1_s3 = round(1 / cityblock(df_tfidf.iloc[0], df_tfidf.iloc[2]), 2)
manhattan_distance_s2_s3 = round(1 / cityblock(df_tfidf.iloc[1], df_tfidf.iloc[2]), 2)

# Calculate Euclidean distances
euclidean_distance_s1_s2 = round(math.dist(df_tfidf.iloc[0], df_tfidf.iloc[1]), 2)
euclidean_distance_s2_s3 = round(math.dist(df_tfidf.iloc[1], df_tfidf.iloc[2]), 2)
euclidean_distance_s1_s3 = round(math.dist(df_tfidf.iloc[0], df_tfidf.iloc[2]), 2)

# Display results


print("\nTerm Frequency Matrix:")
print(df_tf)

print("\nInverse Document Frequency:")
print(df_idf)

print("\nTF-IDF Matrix:")
print(df_tfidf)

print("\nCosine Similarity Matrix:")
print(df_cosine_similarity)

print("\nManhattan Distance:")
print("S1 and S2:", manhattan_distance_s1_s2)
print("S1 and S3:", manhattan_distance_s1_s3)
print("S2 and S3:", manhattan_distance_s2_s3)

print("\nEuclidean Distance:")
print("S1 and S2:", euclidean_distance_s1_s2)
print("S2 and S3:", euclidean_distance_s2_s3)
print("S1 and S3:", euclidean_distance_s1_s3)



Term Frequency Matrix:
   data  science    is   one    of   the  most  important  courses    in  \
0  0.08     0.17  0.08  0.08  0.08  0.08  0.08       0.08     0.08  0.08   
1  0.11     0.11  0.11  0.11  0.11  0.11  0.00       0.00     0.11  0.00   
2  0.33     0.00  0.00  0.00  0.00  0.17  0.00       0.00     0.00  0.00   

   computer  this  best  scientists  perform  analysis  
0      0.08  0.00  0.00        0.00     0.00      0.00  
1      0.00  0.11  0.11        0.00     0.00      0.00  
2      0.00  0.00  0.00        0.17     0.17      0.17  

Inverse Document Frequency:
          Term   IDF
0     computer  0.41
1           is  0.00
2         data -0.29
3          the -0.29
4    important  0.41
5          one  0.00
6           of  0.00
7      science  0.00
8           in  0.41
9      courses  0.00
10        most  0.41
11        best  0.41
12        this  0.41
13     perform  0.41
14  scientists  0.41
15    analysis  0.41

TF-IDF Matrix:
   analysis  best  computer  courses  dat