<font color='blue' size="+3">Design a Text Retrieval System
</font>


#### **Importing Libraries**

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from collections import defaultdict, Counter
from math import log, sqrt

# Download the NLTK data packages this notebook relies on: the 'punkt'
# tokenizer models and the 'stopwords' corpus. nltk.download() skips packages
# that are already up to date, so re-running this cell is cheap.
# NOTE(review): newer NLTK releases may look for a 'punkt_tab' resource when
# tokenizing -- confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Petter\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Petter\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### **Reading Data**

In [24]:
# Load the training and test sets; both paths are relative to the notebook's
# working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
# Preview the first rows of the training data as the cell output.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# row index that was written into the CSV -- confirm whether it should be
# loaded as the index instead of a data column.
train_df.head()

Unnamed: 0,title,description
0,Tsotsi,A South African hoodlum named Tsotsi (Presley ...
1,Abducted in Plain Sight,In 1974 a 12-year-old girl is abducted from a ...
2,My Life Is Murder,Private investigator Alexa Crow always fights ...
3,Empire,Hip-hop artist and CEO of Empire Entertainment...
4,Latter Days,"Aaron Davis (Steve Sandvoss), a young Mormon, ..."


#### **Data Pre-processing**

In [25]:
# Keep only rows that have a title and a non-empty description.
#
# Each frame is rebuilt in one chain instead of mixing inplace=True with
# reassignment, and the chain ends in .copy() so that the columns added in later
# cells are written to an independent DataFrame rather than to a filtered
# slice (which can emit SettingWithCopyWarning on pandas versions without
# copy-on-write). The rows kept are the same as before: dropna() removes rows
# whose title or description is missing, then the mask removes rows whose
# description is the empty string.
train_df = (
    train_df
    .dropna(subset=['title', 'description'])
    .loc[lambda frame: frame['description'] != '']
    .copy()
)
test_df = (
    test_df
    .dropna(subset=['title', 'description'])
    .loc[lambda frame: frame['description'] != '']
    .copy()
)
# Lookup structures shared by the tokenizer below:
#   translator - translation table that deletes every character listed in
#                string.punctuation
#   stop_words - NLTK's English stop-word list, held as a set for membership tests
translator = str.maketrans(dict.fromkeys(string.punctuation))
stop_words = {*stopwords.words('english')}

#### **Text Preprocessing Function**

In [26]:
# Preprocessing function
def preprocess(text):
    """Turn a raw description into a list of normalized tokens.

    The text is lower-cased, stripped of the characters covered by the
    module-level ``translator`` table, split with ``word_tokenize`` and
    filtered against the module-level ``stop_words`` set.

    Args:
        text: The description string. A ``float`` (presumably the NaN that
            pandas uses for missing text -- verify against callers) is
            accepted and treated as "no text".

    Returns:
        list: The remaining tokens in their original order; an empty list
        for float input.
    """
    if isinstance(text, float):
        return []
    cleaned = text.lower().translate(translator)
    return [token for token in word_tokenize(cleaned) if token not in stop_words]

# Tokenize the descriptions of both splits; null descriptions map to an empty
# token list instead of being passed to preprocess().
def _tokens_or_empty(description):
    """Return preprocess(description), or [] when the value is null."""
    return preprocess(description) if pd.notnull(description) else []

train_df['description_tokens'] = train_df['description'].apply(_tokens_or_empty)
test_df['description_tokens'] = test_df['description'].apply(_tokens_or_empty)

#### **Term Frequency (TF)**

In [27]:
# Term Frequency (TF) calculation
def compute_tf(doc):
    """Compute relative term frequencies for one tokenized document.

    Args:
        doc: Sequence of tokens (must support ``len``).

    Returns:
        Counter: Maps each distinct token to its occurrence count divided by
        the total number of tokens in ``doc``. An empty document yields an
        empty Counter (no division takes place).
    """
    total_tokens = len(doc)
    raw_counts = Counter(doc)
    return Counter({term: count / total_tokens for term, count in raw_counts.items()})

# One term-frequency Counter per training description.
train_df['tf'] = train_df['description_tokens'].map(compute_tf)

#### **Document Frequency (DF)**

In [28]:
# Document frequency: for every term, the number of training documents whose
# TF mapping contains it. Each document contributes each of its distinct terms
# exactly once, because iterating a TF mapping yields its keys.
df = Counter(term for doc_tf in train_df['tf'] for term in doc_tf)

#### **Inverse Document Frequency (IDF)**

In [29]:
# Inverse document frequency per term: log(N / (1 + document_frequency)),
# where N is the number of training documents.
# Because of the +1 in the denominator, a term found in N - 1 documents gets a
# weight of exactly 0 and a term found in all N documents gets a negative weight.
num_docs = len(train_df)
idf = {term: log(num_docs / (1 + doc_count)) for term, doc_count in df.items()}

#### **TF-IDF Calculation**

In [30]:
# Combines TF and IDF to compute the TF-IDF score for each term in each document.
def compute_tfidf(tf, idf):
    """Weight a document's term frequencies by inverse document frequency.

    Args:
        tf: Mapping of term -> term frequency for one document.
        idf: Mapping of term -> inverse document frequency.

    Returns:
        dict: term -> tf * idf for every term in ``tf``. Terms missing from
        ``idf`` are multiplied by 0, so they stay in the result with a zero
        weight.
    """
    weighted = {}
    for term, frequency in tf.items():
        weighted[term] = frequency * idf.get(term, 0)
    return weighted

# TF-IDF vector (term -> weight dict) for every training description, using the
# corpus-level idf table.
train_df['tfidf'] = train_df['tf'].apply(compute_tfidf, args=(idf,))

#### **Inverted Index**

In [31]:
# Inverted index: term -> list of (document position, TF-IDF weight) postings.
# Positions come from enumerate(), i.e. they are 0-based row positions in
# train_df, not index labels.
# NOTE(review): no cell visible in this notebook reads inverted_index -- confirm
# whether it is still needed.
inverted_index = defaultdict(list)
for doc_position, doc_weights in enumerate(train_df['tfidf']):
    for term in doc_weights:
        inverted_index[term].append((doc_position, doc_weights[term]))

#### **Cosine Similarity Calculation**

In [32]:
# Computes the cosine similarity between two TF-IDF vectors.
def compute_cosine_similarity(vec1, vec2):
    """Cosine similarity between two sparse vectors stored as dicts.

    Args:
        vec1: Mapping of term -> weight.
        vec2: Mapping of term -> weight.

    Returns:
        float: dot(vec1, vec2) / (|vec1| * |vec2|), where the dot product runs
        over the terms present in both mappings. Returns 0.0 when either
        vector has zero length (including an empty mapping).
    """
    def _magnitude(vec):
        # Euclidean length over all stored weights.
        return sqrt(sum(weight**2 for weight in vec.values()))

    magnitude1 = _magnitude(vec1)
    magnitude2 = _magnitude(vec2)
    if not (magnitude1 and magnitude2):
        return 0.0

    shared_terms = set(vec1.keys()) & set(vec2.keys())
    overlap = sum(vec1[term] * vec2[term] for term in shared_terms)
    return overlap / (magnitude1 * magnitude2)

#### **Finding Similar Descriptions**

In [33]:
# Function to find similar descriptions
def find_similar_descriptions(test_desc, train_df, top_n=3):
    """Rank training descriptions by cosine similarity to a query.

    Args:
        test_desc: Token list of the query description.
        train_df: DataFrame with a 'tfidf' column holding one term -> weight
            mapping per row.
        top_n: Number of best matches to return.

    Returns:
        tuple: (positions, test_tfidf) where ``positions`` lists the 0-based
        row positions in ``train_df`` of the ``top_n`` most similar rows, best
        first (ties keep row order), and ``test_tfidf`` is the query's TF-IDF
        mapping built with the module-level ``idf`` table.
    """
    test_tfidf = compute_tfidf(compute_tf(test_desc), idf)
    scored = [
        (position, compute_cosine_similarity(test_tfidf, train_tfidf))
        for position, train_tfidf in enumerate(train_df['tfidf'])
    ]
    # Stable descending sort, so equally scored rows stay in row order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [position for position, _ in scored[:top_n]], test_tfidf

#### **Generating Results**

In [34]:
# Retrieve the best-matching training rows for every test description.
#   results[i]     - list of [title, description, description_tokens, tfidf]
#                    rows for the i-th test movie, best match first
#   test_tfidfs[i] - TF-IDF mapping of the i-th test description
results, test_tfidfs = [], []
for query_tokens in test_df['description_tokens']:
    top_positions, query_tfidf = find_similar_descriptions(query_tokens, train_df)
    # Positions are row offsets, hence .iloc rather than label-based lookup.
    matched_rows = train_df.iloc[top_positions]
    matched_fields = matched_rows[['title', 'description', 'description_tokens', 'tfidf']]
    results.append(matched_fields.values.tolist())
    test_tfidfs.append(query_tfidf)

#### **Average TF-IDF Cosine Similarity**

In [35]:
# Average TF-IDF similarity
def calculate_average_tfidf_similarity(test_tfidf, similar_movies):
    """Mean cosine similarity between a query and its retrieved movies.

    Args:
        test_tfidf: TF-IDF mapping (term -> weight) of the query description.
        similar_movies: Sequence of rows whose element at index 3 is the
            TF-IDF mapping of a retrieved movie.

    Returns:
        The ``np.mean`` of the per-movie cosine similarities.
    """
    return np.mean([
        compute_cosine_similarity(test_tfidf, movie[3])
        for movie in similar_movies
    ])

# Mean cosine similarity between each test movie and its retrieved matches.
# The averaging is delegated to calculate_average_tfidf_similarity so the logic
# lives in one place. float() turns NumPy's scalar into a plain Python float, so
# the printed list reads as bare numbers however the installed NumPy renders
# its scalars inside containers.
average_cosine_similarities = [
    float(calculate_average_tfidf_similarity(test_tfidf, similar_movies))
    for test_tfidf, similar_movies in zip(test_tfidfs, results)
]

# Headline figure: the mean of the per-movie averages.
average_cosine_similarity = np.mean(average_cosine_similarities)

print(f"TF-IDF Cosine Similarities for each test-movie with its top 3:\n\n {average_cosine_similarities}\n")
print(f"Average TF-IDF Cosine Similarity: {average_cosine_similarity:.2f}")

TF-IDF Cosine Similarities for each test-movie with its top 3:

 [0.11602708091335938, 0.15051074656968355, 0.15267076138636576, 0.1436510421558677, 0.16862311975533684, 0.1731129520538688, 0.2260479765171135, 0.3600768643395129, 0.14463395997799042, 0.12536409673992438]

Average TF-IDF Cosine Similarity: 0.18


#### **Displaying Results**

In [37]:
# Print every test movie followed by the title and description of each of its
# retrieved matches (results[i] belongs to the i-th test row).
separator = "-" * 110
for position, (test_movie, test_desc) in enumerate(zip(test_df['title'], test_df['description'])):
    print(separator)
    print(separator)
    print(f"Test Movie's title: {test_movie}")
    print(f"Test Movie's description: {test_desc}")
    print(separator)
    print("TOP 3 SIMILAR MOVIES:")
    print()

    matches = results[position]
    for movie in matches:
        print(f"\tTitle: {movie[0]}")
        print(f"\tDescription: {movie[1]}")
        print()
    print()


--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
Test Movie's title: Splitting Up Together
Test Movie's description: Ellen DeGeneres serves as executive producer of this comedy that is based on a Danish series of the same name. Lena and Martin were certain enough that their marriage was over to make it official and go through the complicated untangling involved in filing for divorce. The last thing that they expected was to be brought back together by the experience, but they find their relationship strangely reignited by the experience. Together, they navigate their evolving relationship with open minds and newly reopened hearts.
--------------------------------------------------------------------------------------------------------------
TOP 3 SIMILAR MOVIES:

	Title: Happily Divorced
	Description: Fran Drescher s