In [1]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from collections import Counter
from unidecode import unidecode
from collections import defaultdict
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
imdb_df = pd.read_csv("imdb.csv")
movie_titles = imdb_df["Series_Title"].tolist()

# https://www.kaggle.com/datasets/basilb2s/language-detection?resource=download
language_dataset = pd.read_csv("ld.csv")
texts = language_dataset["Text"]
languages = language_dataset["Language"]

In [3]:
english_text = "This is an example sentence in English."
spanish_text = "Este es un ejemplo de frase en español."
french_text = "Ceci est un exemple de phrase en français."
german_text = "Dies ist ein Beispiel für einen Satz auf Deutsch."
italian_text = "Questo è un esempio di frase in italiano."

languages_test = {
    "english": english_text,
    "spanish": spanish_text,
    "french": french_text,
    "german": german_text,
    "italian": italian_text,
}

### Question A

In [4]:
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jarvis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jarvis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/jarvis/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
def detect_language(text, languages):
    stop_words_count = {}

    for lang in languages:
        stop_words = set(stopwords.words(lang))
        words = nltk.word_tokenize(text.lower())
        common_stop_words = set([word for word in words if word in stop_words])
        stop_words_count[lang] = len(common_stop_words)

    detected_language = max(stop_words_count, key=stop_words_count.get)
    return detected_language


detected_lang = detect_language(languages_test.get("italian"), languages_test)
print(f"The detected language is: {detected_lang}")

The detected language is: italian


### Question B

In [6]:
# Function to perform subword tokenization on a given text
def subword_tokenization(text, lengths=[2, 3, 4]):
    subtokens = set()
    for length in lengths:
        for i in range(len(text) - length + 1):
            subtoken = text[i : i + length]
            subtokens.add(subtoken)
    return subtokens


# Extract movie titles from the DataFrame and convert to a list
movie_titles = imdb_df["Series_Title"].tolist()

set_of_word_representation = set()

# Generate subword representations for all movie titles and store in a set
for title in movie_titles:
    subtokens = subword_tokenization(title)
    set_of_word_representation.update(subtokens)


# Function to calculate similarity between two texts based on subword tokenization
def similarity(query, title):
    query_subtokens = subword_tokenization(query)
    title_subtokens = subword_tokenization(title)

    # Calculate intersection and union of subtokens
    intersection = len(query_subtokens.intersection(title_subtokens))
    union = len(query_subtokens.union(title_subtokens))

    # Calculate similarity score to measure overlap
    similarity_score = intersection / union if union > 0 else 0

    return similarity_score


query = "Lion"
similarity_threshold = 0.1

# Compare query with movie titles and print titles with similarity above the threshold
for title in movie_titles:
    score = similarity(query, title)
    if score > similarity_threshold:
        print(f"Title: {title}, Similarity Score: {score}")

Title: Inception, Similarity Score: 0.125
Title: The Lion King, Similarity Score: 0.18181818181818182
Title: Lion, Similarity Score: 1.0
Title: Ying xiong, Similarity Score: 0.11538461538461539
Title: The Lion in Winter, Similarity Score: 0.13043478260869565
Title: Repulsion, Similarity Score: 0.125


### Question C

In [7]:
# Encode movie titles to get dense vectors
def encode_titles(model, titles):
    return model.encode(titles, convert_to_tensor=True)


# Perform semantic search for a given query and model
def semantic_search(model_name, query, movie_titles, similarity_threshold=0.7):
    start_time = time.time()

    # Load the selected sentence transformer model
    model = SentenceTransformer(model_name)

    # Encode movie titles to get dense vectors
    encoded_titles = encode_titles(model, movie_titles)

    # Encode the query to get its dense vector representation
    encoded_query = model.encode(query, convert_to_tensor=True)

    # Calculate cosine similarity between the query and movie titles
    cos_similarities = util.pytorch_cos_sim(encoded_query, encoded_titles)

    # Find titles that are semantically similar to the query
    similar_titles = []
    for i, score in enumerate(cos_similarities[0]):
        if score > similarity_threshold:
            similar_titles.append((movie_titles[i], score))

    end_time = time.time()
    execution_time = end_time - start_time

    return similar_titles, execution_time


# Define the query
query = "Father"

# List of different models to benchmark
model_names = [
    "paraphrase-MiniLM-L6-v2",
    "all-MiniLM-L6-v2",
    "average_word_embeddings_glove.6B.300d",
]

# Benchmark different models
for model_name in model_names:
    print(f"Model: {model_name}")
    similar_titles, execution_time = semantic_search(model_name, query, movie_titles)
    print(f"Similar Titles:")
    for title, score in similar_titles:
        print(f"Title: {title}, Similarity Score: {score}")
    print(f"Execution Time: {execution_time:.4f} seconds")
    print("-" * 40)

Model: paraphrase-MiniLM-L6-v2
Similar Titles:
Title: In the Name of the Father, Similarity Score: 0.8266555666923523
Execution Time: 13.1069 seconds
----------------------------------------
Model: all-MiniLM-L6-v2
Similar Titles:
Execution Time: 2.3783 seconds
----------------------------------------
Model: average_word_embeddings_glove.6B.300d
Similar Titles:
Title: In the Name of the Father, Similarity Score: 1.0
Execution Time: 4.5499 seconds
----------------------------------------


### Question D

In [8]:
# Initialize defaultdict to store sub-sequences for each language
sub_sequences_per_language = defaultdict(list)

# Generate sub-sequences for each text and respective language
for text, language in zip(texts, languages):
    sub_sequences = subword_tokenization(text)
    sub_sequences_per_language[language].extend(sub_sequences)

# Count occurrences of sub-sequences per language
top_n_sub_sequences = {}
top_n = 1000  # Set the number of top sub-sequences per language

for language, sequences in sub_sequences_per_language.items():
    sequence_counter = Counter(sequences)
    top_sequences = sequence_counter.most_common(top_n)
    top_n_sub_sequences[language] = [seq for seq, _ in top_sequences]

# Flatten top sub-sequences for training
X_train = []
y_train = []

for language, sequences in top_n_sub_sequences.items():
    X_train.extend(sequences)
    y_train.extend([language] * len(sequences))

# Vectorize the sub-sequences
vectorizer = CountVectorizer(analyzer="char", ngram_range=(2, 4))
X_train_vectorized = vectorizer.fit_transform(X_train)

# Train the Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vectorized, y_train)


# Function to predict language
def predict_language(text, vectorizer, nb_classifier):
    sub_sequences = subword_tokenization(text)
    text_vectorized = vectorizer.transform(sub_sequences)
    predicted_language = nb_classifier.predict(text_vectorized)
    return predicted_language

In [9]:
# Predict the language of the test text
predicted_language = predict_language(
    languages_test.get("german"), vectorizer, nb_classifier
)
print(f"Test Text: {languages_test.get('german')}")
print(f"Predicted Language: {predicted_language[0]}")

Test Text: Dies ist ein Beispiel für einen Satz auf Deutsch.
Predicted Language: Turkish
