In [1]:
# Importing necessary libraries
import os
import pandas as pd
import nltk

from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random
import json


In [2]:
# Download the NLTK resources this notebook relies on: the stop-word lists,
# the Punkt tokenizer data and WordNet.
# 'punkt_tab' is the Punkt data package that newer NLTK releases load inside
# word_tokenize; older releases still read 'punkt', so both are requested.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Read the dialogues CSV into the DataFrame that is labelled at the end of the notebook
data_link = "/content/chatbot/Data/dialogues.csv"
df = pd.read_csv(filepath_or_buffer=data_link)

In [4]:
# Location of the parsed episode-dialogues JSON opened by the next cell.
# NOTE: the space in "How_I met_your" is part of the path exactly as written here.
data_path = "/content/chatbot/Data/How_I met_your_mother_episodes_dialogues_parsed.json"

In [5]:
# Read the parsed episodes JSON (a sequence of episodes, each with a "dialogues" list)
with open(data_path, mode='r', encoding='utf-8') as file:
    data = json.load(file)

# Flatten every dialogue's "text" field into one list, in file order
text_list = [
    dialogue["text"]
    for episode in data
    for dialogue in episode["dialogues"]
]

In [7]:
# Custom tokenizer handed to TfidfVectorizer: tokenize, drop stop words, lemmatize
def lemma_tokenizer(text):
    """Split ``text`` into lemmatized tokens, skipping stop words.

    A token is skipped when its lower-cased form is found in the
    module-level ``nltk_stopwords`` collection; every remaining token is
    passed through ``WordNetLemmatizer.lemmatize`` unchanged in case.

    Args:
        text: the string to tokenize.

    Returns:
        A list of lemmatized tokens in their original order.
    """
    wordnet = WordNetLemmatizer()
    kept = []
    for word in word_tokenize(text):
        if word.lower() in nltk_stopwords:
            continue
        kept.append(wordnet.lemmatize(word))
    return kept

In [20]:
# English stop words, kept as a list: it is passed to TfidfVectorizer's
# `stop_words` and is also the global that lemma_tokenizer checks tokens against.
nltk_stopwords = list(stopwords.words('english'))

# Unigram + bigram TF-IDF over the dialogue texts, using the custom lemmatizing
# tokenizer and keeping at most 5024 features.
vectorizer = TfidfVectorizer(
    stop_words=nltk_stopwords,
    tokenizer=lemma_tokenizer,
    ngram_range=(1, 2),
    max_features=5024,
)
matrix_tfidf = vectorizer.fit_transform(text_list)

In [83]:
def get_relevant_phrase(text, mtx, text_list, relevantness=1.0, rel_random=0.05):
    """Return the phrase from ``text_list`` at a chosen similarity rank for ``text``.

    The query is vectorized with the module-level ``vectorizer`` and compared
    to every row of ``mtx`` by cosine similarity. Rows are ranked from least
    to most similar, and ``relevantness`` picks a position in that ranking.

    Args:
        text: query string.
        mtx: matrix of corpus vectors; row ``i`` is assumed to correspond to
            ``text_list[i]`` and to come from the same ``vectorizer``
            (TODO confirm at call sites).
        text_list: sequence of candidate phrases.
        relevantness: fraction of the ranking to select, where 0 is the least
            similar phrase and 1 the most similar. Values outside [0, 1] are
            clamped after the random boost is added.
        rel_random: upper bound of a uniform random boost added to
            ``relevantness``.

    Returns:
        The element of ``text_list`` at the selected rank.

    Note:
        The boost is only ever added, so with the default ``relevantness=1.0``
        it is absorbed by the upper clamp and the result is deterministic.
    """
    # Transform the input text to a TF-IDF vector
    query_vector = vectorizer.transform([text])
    # Similarity of the query against every corpus row, as a flat array
    cosine_similarities = cosine_similarity(query_vector, mtx).flatten()
    # Row indices ordered from least to most similar
    relevant_indices = np.argsort(cosine_similarities, axis=0)
    # Random boost in [0, rel_random)
    k_random = random.random() * rel_random
    # Clamp on both sides: without the lower bound a negative fraction turns
    # into a negative index, which wraps around to the most-similar end.
    relevantness = max(0.0, min(1.0, relevantness + k_random))
    # Map the fraction onto a position in the ranking
    ind = relevant_indices[int((len(relevant_indices) - 1) * relevantness)]
    return text_list[ind]

In [97]:
# Example: fetch the corpus phrase ranked most similar to the query "suit"
get_relevant_phrase("suit", mtx=matrix_tfidf, text_list=text_list)

' And the diving suit?'

In [84]:
def get_cosine_similarity_label(question, answer, vectorizer):
    """Bucket the cosine similarity of a question/answer pair into an integer.

    Both strings are vectorized with ``vectorizer.transform``; their cosine
    similarity is multiplied by 9 and truncated with ``int``.

    NOTE(review): a second ``get_cosine_similarity_label`` is defined further
    down in this notebook. When the cells run in file order, that later
    definition replaces this one.

    Args:
        question: first text of the pair.
        answer: second text of the pair.
        vectorizer: object exposing ``transform`` over a list of strings.

    Returns:
        The truncated, scaled similarity as an ``int``.
    """
    q_vec, a_vec = (vectorizer.transform([piece]) for piece in (question, answer))
    similarity = cosine_similarity(q_vec, a_vec)[0, 0]
    return int(similarity * 9)


In [87]:
def get_cosine_similarity_label(question, answer, vectorizer):
    """Turn a noisy cosine similarity of a question/answer pair into a small integer label.

    Both strings are vectorized with ``vectorizer.transform``. Uniform noise
    from [-0.1, 0.1) is added to their cosine similarity, the sum is clipped
    to [0, 1], doubled and truncated with ``int``. The label is therefore 0
    below 0.5, 1 from 0.5 up to (but excluding) 1, and 2 only when the
    clipped value is exactly 1.

    Args:
        question: first text of the pair.
        answer: second text of the pair.
        vectorizer: object exposing ``transform`` over a list of strings.

    Returns:
        The label as an ``int``. Results vary between calls because the noise
        is drawn from NumPy's global random state.
    """
    vectors = [vectorizer.transform([piece]) for piece in (question, answer)]

    # Perturb the raw similarity with uniform noise
    noisy_sim = cosine_similarity(vectors[0], vectors[1])[0, 0] + np.random.uniform(-0.1, 0.1)

    # Keep the perturbed value inside [0, 1] before bucketing
    bounded_sim = np.clip(noisy_sim, 0, 1)

    return int(bounded_sim * 2)

In [92]:
def assign_label(row):
    """Label one row by the similarity between its question and the retrieved corpus phrase.

    Args:
        row: mapping-like row (for example the Series that
            ``DataFrame.apply(..., axis=1)`` passes) holding the question
            text under ``'Q'`` or, when that key is absent, ``'Context'``.

    Returns:
        The integer label produced by ``get_cosine_similarity_label`` for the
        question and the phrase returned by ``get_relevant_phrase``.

    Raises:
        KeyError: if the row has neither a ``'Q'`` nor a ``'Context'`` entry.
    """
    # The labelling cell selects only the 'Context' and 'A' columns, so a
    # hard-coded row['Q'] raised KeyError there; accept either key.
    question = row['Q'] if 'Q' in row else row['Context']
    # NOTE(review): the row's 'A' value does not take part in the label; the
    # question is compared against the retrieved phrase. Confirm this is intended.
    relevant_phrase = get_relevant_phrase(question, matrix_tfidf, text_list)
    return get_cosine_similarity_label(question, relevant_phrase, vectorizer)

In [98]:
# Drop rows with missing values, then compute a "label" for each remaining row
# by applying assign_label row-wise to the 'Context' and 'A' columns
df = df.dropna()
df['label'] = df.loc[:, ['Context', 'A']].apply(assign_label, axis='columns')

# Write the labelled frame to disk without the index column
df.to_csv('labeled_dataset.csv', index=False)