In [6]:
import numpy as np
import integrated_chatbot
import string
import random
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Ensure necessary NLTK data packages are downloaded
integrated_chatbot.download('punkt')
integrated_chatbot.download('wordnet')
integrated_chatbot.download('omw-1.4')

# Reading the Corpus text
with open("hilancer.txt", 'r', errors='ignore') as f:
    raw_doc = f.read()

# Converting entire text to lowercase
raw_doc = raw_doc.lower()

# Removing colons and commas
raw_doc = re.sub(r'[:,]', '', raw_doc)

# Tokenizing the text into sentences and words
sentence_tokens = integrated_chatbot.sent_tokenize(raw_doc)
word_tokens = integrated_chatbot.word_tokenize(raw_doc)

# Split corpus into questions and answers
questions = []
answers = []
current_answer = []

for sentence in sentence_tokens:
    if sentence.endswith('?'):
        if questions:
            # Append the previous answer
            answers.append(' '.join(current_answer))
            current_answer = []
        questions.append(sentence)
    else:
        current_answer.append(sentence)

# Add the last answer if exists
if current_answer:
    answers.append(' '.join(current_answer))

# Ensure the number of answers matches questions
if len(answers) < len(questions):
    answers.append("I am sorry, I do not have an answer for that.")

# Text Preprocessing Steps
lemmer = integrated_chatbot.stem.WordNetLemmatizer()

def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]

remove_punc_dict = dict((ord(punct), None) for punct in string.punctuation)

def LemNormalize(text):
    return LemTokens(integrated_chatbot.word_tokenize(text.lower().translate(remove_punc_dict)))

# Initialize and fit the TF-IDF Vectorizer on the questions
TfidVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
tfidf = TfidVec.fit_transform(questions)

# Function to generate response
def response(user_response):
    robo_response = ''
    # Preprocess the user input
    user_response_processed = LemNormalize(user_response)
    user_response_processed = ' '.join(user_response_processed)  # Convert list back to string
    # Vectorize the user input
    tfidf_user = TfidVec.transform([user_response_processed])
    # Compute cosine similarity between user input and all questions
    vals = cosine_similarity(tfidf_user, tfidf)
    # Get the index of the most similar question
    idx = vals.argsort()[0][-1]
    # Get the highest similarity score
    flat = vals.flatten()
    flat.sort()
    req_tfidf = flat[-1]
    if req_tfidf == 0:
        robo_response = "I am sorry. Unable to understand you!"
    else:
        robo_response = answers[idx]
    return robo_response

# Defining the ChatFlow
def hilancer_chatbot():
    flag = True
    print(flag)
    print("Hello! I am Hi-Lancer ChatBot. Start typing your text after greeting to talk to me. To end the conversation, type 'bye'.")

    while flag:
        user_response = input("You: ").lower()

        if user_response != 'bye':
            if user_response in ('thank you', 'thanks'):
                flag = False
                print('Bot: You are welcome.')
            else:

                # Get the chatbot's response
                bot_response = response(user_response)

                # Print the chatbot's response
                print('Bot:', bot_response)
        else:
            flag = False
            print('Bot: Call me whenever you want help!')


# Run the chatbot
hilancer_chatbot()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ishav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ishav\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ishav\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\ishav/nltk_data'
    - 'c:\\Program Files\\Python312\\nltk_data'
    - 'c:\\Program Files\\Python312\\share\\nltk_data'
    - 'c:\\Program Files\\Python312\\lib\\nltk_data'
    - 'C:\\Users\\ishav\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************
