<a href="https://colab.research.google.com/github/The-Loved-One/AIML-NITT-Chatbot/blob/patch-1/GPT2ChatbotAndMore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Approach using Cosine Similarity and Intent Classification

In [43]:
# Primitive Approach

import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random

# Load the dataset from the JSON file
def load_dataset(file_path):
    with open(file_path, 'r') as file:
        dataset = json.load(file)
    return dataset

# Preprocess the dataset to create lists of patterns, responses, and tags
def preprocess_dataset(dataset):
    patterns = []
    responses = []
    tags = []

    for entry in dataset:
        tag = entry['tag']
        for pattern in entry['patterns']:
            patterns.append(pattern)
            responses.append(entry['responses'][0])  # Take the first response for simplicity
            tags.append(tag)

    return patterns, responses, tags

# Vectorize the patterns using TF-IDF
def vectorize_patterns(patterns):
    vectorizer = TfidfVectorizer()
    pattern_vectors = vectorizer.fit_transform(patterns)
    return pattern_vectors, vectorizer

# Get the user query vector using the trained TF-IDF vectorizer
def vectorize_user_query(user_query, vectorizer):
    return vectorizer.transform([user_query])

# Calculate the similarity between the user query vector and pattern vectors
def calculate_similarity(user_query, pattern_vectors, vectorizer):
    # Vectorize the user's query
    user_vector = vectorizer.transform([user_query])

    # Calculate cosine similarity between the user's query and pattern vectors
    similarities = cosine_similarity(user_vector, pattern_vectors)

    return similarities

# Chatbot function
def chatbot(user_query, patterns, responses, tags, vectorizer, similarity_threshold=0.5):
    # Vectorize user query
    user_vector = vectorize_user_query(user_query, vectorizer)

    # Calculate similarity
    similarities = calculate_similarity(user_query, pattern_vectors, vectorizer)

    # Find the index of the best match
    best_match_index = np.argmax(similarities)

    # Check if the best match meets the similarity threshold
    best_similarity = similarities[best_match_index]
    if best_similarity > similarity_threshold:
        # Print the corresponding response and tag
        response = responses[best_match_index]
        tag = tags[best_match_index]
        print("Chatbot:", response, f"(Tag: {tag}, Similarity: {best_similarity:.2f})")
    else:
        print("Chatbot: I'm sorry, but I didn't understand that.")

# Allow the user to upload a JSON file
def upload_json():
    uploaded = False
    while not uploaded:
        try:
            file_path = '/content/DataSet.json'
            dataset = load_dataset(file_path)
            uploaded = True
        except FileNotFoundError:
            print("File not found. Please try again.")
    return dataset

def chat_loop():
    # Upload dataset
    dataset = upload_json()

    # Preprocess dataset
    patterns, responses, tags = preprocess_dataset(dataset)

    # Vectorize patterns
    pattern_vectors, vectorizer = vectorize_patterns(patterns)

    # Chat loop
    while True:
        user_query = input("User: ")
        if user_query.lower() == 'exit':
            print("Chatbot: Goodbye!")
            break

        # Calculate similarity using the correct variable
        similarities = calculate_similarity(user_query, pattern_vectors, vectorizer)

        # Find the index of the best match
        best_match_index = similarities.argmax()

        # Check if the best match meets the similarity threshold
        best_similarity = similarities[0, best_match_index]  # Adjusted indexing here
        if best_similarity > similarity_threshold:
            # Print the corresponding response and tag
            response = responses[best_match_index]
            tag = tags[best_match_index]
            print("Chatbot:", response, f"(Tag: {tag})")
        else:
            print("Chatbot: I'm sorry, but I didn't understand that.")

if __name__ == "__main__":
    # Start the chat loop
    chat_loop()


User: hey
Chatbot: Hi there! (Tag: greeting)
User: hey
Chatbot: Hi there! (Tag: greeting)
User: hey
Chatbot: Hi there! (Tag: greeting)
User: ck you
Chatbot: You're welcome! (Tag: thanks)
User: fuck you
Chatbot: I'm here to help, so please refrain from using inappropriate language. (Tag: swear)
User: exit
Chatbot: Goodbye!


# Pure GPT-2 Implementation

In [34]:
# GPT-2 Implementation. IDK how to use our dataset to train it instead.

from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Start the interactive chat
print("Chatbot: Hi! I'm your chatbot. Type 'quit' to exit.")
while True:
    # Take user input
    user_input = input("User: ")

    # Check if the user wants to quit
    if user_input.lower() == 'quit':
        print("Chatbot: Goodbye!")
        break

    # Tokenize and encode the user's input
    input_ids = tokenizer.encode(user_input, return_tensors="pt")

    # Generate a response
    output = model.generate(input_ids, max_length=50, num_beams=5, no_repeat_ngram_size=2, top_k=50, top_p=0.95)
    response = tokenizer.decode(output[0], skip_special_tokens=True)

    # Print the chatbot's response
    print("Chatbot:", response)

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Chatbot: Hi! I'm your chatbot. Type 'quit' to exit.
User: hi


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: hi.

"I don't know what to do," he said. "I'm just trying to get out of here. I'm not going anywhere. It's just a matter of time."
User: can you tell me where the library is located?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: can you tell me where the library is located?

I don't know. I'm not sure where it's located, but I can tell you that there are a lot of libraries in the city that are open to the public. There are
User: quit
Chatbot: Goodbye!


# Cosine Similarity + GPT-2

In [46]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import json

def load_gpt2_model():
    model_name = "gpt2"
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    return model, tokenizer

def generate_response_gpt2(prompt, model, tokenizer, max_length=50):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output_ids = model.generate(input_ids, max_length=max_length, num_beams=5, no_repeat_ngram_size=2, top_k=50, top_p=0.95, temperature=0.7, do_sample=True)
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return response

def load_dataset_from_json(json_path):
    with open(json_path, "r") as file:
        dataset = json.load(file)
    return dataset

def chat_loop_with_gpt2(model, tokenizer, dataset):
    print("Generative Chatbot: Hello! How can I help you today?")
    while True:
        user_query = input("User: ")
        if user_query.lower() in ["bye", "exit", "quit"]:
            print("Generative Chatbot: Goodbye! Have a great day.")
            break

        # Check if the user query is in the dataset
        matched_pattern, response = find_matching_pattern(user_query, dataset)

        if matched_pattern:
            # If the user query is in the dataset, use the predefined response as a base
            print("Generative Chatbot (Dataset):", response)
        else:
            # If the user query is not in the dataset, generate a response using GPT-2
            gpt2_response = generate_response_gpt2(user_query, model, tokenizer)
            # Correct punctuation and handle incomplete sentences
            gpt2_response = gpt2_response.replace('."', '".').replace('?."', '"?')
            print("Generative Chatbot (GPT-2):", gpt2_response)

def find_matching_pattern(user_query, dataset):
    # Check if the user query matches any pattern in the dataset
    for item in dataset:
        for pattern in item["patterns"]:
            if user_query.lower() in pattern.lower():
                return pattern, item["responses"][0]  # Use the first response as a base
    return None, None

if __name__ == "__main__":
    # Load GPT-2 model
    gpt2_model, gpt2_tokenizer = load_gpt2_model()

    # Load dataset from JSON file
    json_path = "/content/DataSet.json"  # Replace with the actual path to your JSON file
    dataset = load_dataset_from_json(json_path)

    # Start the chat loop with the generative approach
    chat_loop_with_gpt2(gpt2_model, gpt2_tokenizer, dataset)


Generative Chatbot: Hello! How can I help you today?
User: hi
Generative Chatbot (Dataset): The university is located at 123 University St.
User: hi
Generative Chatbot (Dataset): The university is located at 123 University St.
User: hello
Generative Chatbot (Dataset): Hello!
User: hello
Generative Chatbot (Dataset): Hello!
User: can you tell me some good places to eat near me


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generative Chatbot (GPT-2): can you tell me some good places to eat near me?"

"I don't know where I'm going to go," she said. "I'm not sure if I'll be able to find a place to stay. I've got a
User: can you tell me what the fee structure is?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generative Chatbot (GPT-2): can you tell me what the fee structure is?

I don't know. I'm not going to tell you. But if you want to know what it is, you have to go to the website. It's a free service.

User: exit
Generative Chatbot: Goodbye! Have a great day.


# Cosine Similarity + GPT-2 with Context Awareness

In [50]:
# Context Awareness

import json
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import GPT2LMHeadModel, GPT2Tokenizer

def load_dataset(file_path):
    with open(file_path, 'r') as file:
        dataset = json.load(file)
    return dataset

def preprocess_data(dataset):
    patterns, responses = [], []
    tags = []
    for item in dataset:
        for pattern in item['patterns']:
            patterns.append(pattern)
            responses.append(item['responses'][0])  # Using only the first response for simplicity
            tags.append(item['tag'])
    return patterns, responses, tags

def calculate_similarity(user_vector, pattern_vectors):
    similarities = cosine_similarity(user_vector, pattern_vectors)
    return similarities

def chatbot(user_query, patterns, responses, tags, vectorizer, model, tokenizer, context=None):
    if context is None:
        context = []

    # Add user input to context
    context.append(user_query)

    # Vectorize user input
    user_vector = vectorizer.transform([user_query])

    # Vectorize patterns
    pattern_vectors = vectorizer.transform(patterns)

    # Calculate similarity
    similarities = calculate_similarity(user_vector, pattern_vectors)

    # Print debugging information
    print("User Query:", user_query)
    print("Similarities:", similarities)

    # Find the index of the best match
    best_match_index = similarities.argmax()

    # Print debugging information
    print("Best Match Index:", best_match_index)

    # Check if the best match meets the similarity threshold
    best_similarity = similarities[best_match_index]
    similarity_threshold = 0.5  # You can adjust this threshold
    print("Best Similarity:", best_similarity)
    if best_similarity > similarity_threshold:
        # Print the corresponding response and tag
        response = responses[best_match_index]
        tag = tags[best_match_index]
        print("Generative Chatbot (Dataset):", response, f"(Tag: {tag})")
    else:
        # Use GPT-2 for generative responses
        generative_response = generate_response(user_query, context, model, tokenizer)
        print("Generative Chatbot (GPT-2):", generative_response)


def generate_response(user_query, context, model, tokenizer):
    # Combine user input with context
    input_text = ' '.join(context[-2:])  # Using the last two turns of conversation as input

    # Generate response using GPT-2
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    output = model.generate(input_ids, max_length=100, num_return_sequences=1, no_repeat_ngram_size=2)

    # Decode and return the generated response
    generative_response = tokenizer.decode(output[0], skip_special_tokens=True)
    return generative_response

def chat_loop():
    dataset_path = "/content/DataSet.json"  # Replace with the actual path to your dataset
    dataset = load_dataset(dataset_path)
    patterns, responses, tags = preprocess_data(dataset)

    # Initialize TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    pattern_vectors = vectorizer.fit_transform(patterns)

    # Load GPT-2 model and tokenizer
    gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    print("Generative Chatbot: Hello! How can I help you today?")
    context = []

    while True:
        user_query = input("User: ")
        if user_query.lower() == 'exit':
            print("Generative Chatbot: Goodbye! Have a great day.")
            break

        chatbot(user_query, patterns, responses, tags, vectorizer, gpt2_model, gpt2_tokenizer, context)

if __name__ == "__main__":
    # Start the chat loop
    chat_loop()


Generative Chatbot: Hello! How can I help you today?
User: hi
User Query: hi
Similarities: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0

IndexError: ignored