# In [11]:
import pandas as pd
import numpy as np
import streamlit as st
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import make_pipeline
import joblib

# Load the question/answer dataset that backs both the intent classifier and
# the similarity-based answer lookup.
# NOTE(review): Streamlit re-executes this script on every interaction, so the
# CSV load, model training and model dump below are repeated each time --
# consider moving them behind Streamlit's caching decorators.
data = pd.read_csv("part2.csv")

# Rows with a missing question or intent make the vectorizer/classifier fit
# fail, so drop them up front. `tfidf_matrix` below is built from this same
# filtered frame, so row i of the matrix corresponds to `data.iloc[i]`.
data = data.dropna(subset=['Questions\t', 'Response Category']).reset_index(drop=True)

# Prepare the dataset for training
X = data['Questions\t']  # Input feature (user queries); column is looked up with its trailing tab
y = data['Response Category']  # Target feature (intent)

# Split the data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Intent classifier: TF-IDF features fed into logistic regression
model = make_pipeline(TfidfVectorizer(), LogisticRegression())
model.fit(X_train, y_train)

# Save the model for later use
joblib.dump(model, "chatbot_model1.pkl")

# Fit a separate vectorizer on every question (not only the training split);
# it is used for the cosine-similarity lookup.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(X)  # One TF-IDF row per question in the dataset

# Define response logic with cosine similarity
def get_response(user_input: str, min_similarity: float = 0.0):
    """Return the stored answer whose question best matches *user_input*.

    The query is vectorized with the module-level ``tfidf_vectorizer`` and
    compared, by cosine similarity, against every row of ``tfidf_matrix``.

    Args:
        user_input: The text typed by the user.
        min_similarity: The best score must be strictly greater than this
            value for an answer to be returned. The default of 0.0 only
            rejects queries that share no vocabulary with the dataset.

    Returns:
        The ``'Answers'`` value of the best-matching row of ``data``, or a
        fallback message when no question is similar enough.
    """
    # Transform user input using the same TF-IDF vectorizer as the dataset
    user_input_vector = tfidf_vectorizer.transform([user_input])

    # One query row in, so take row 0: a similarity score per dataset question
    scores = cosine_similarity(user_input_vector, tfidf_matrix)[0]

    # Position of the most similar question
    best_match_index = int(np.argmax(scores))

    # When every score is zero, argmax falls back to index 0 and would return
    # the first answer for a completely unrelated query; say so instead.
    if scores[best_match_index] <= min_similarity:
        return "Sorry, I don't have an answer for that. Could you rephrase your question?"

    # Retrieve the answer corresponding to the best match
    return data.iloc[best_match_index]['Answers']

# Streamlit front end: a title, a short prompt, and a single text box.
st.title("Travel Chatbot")
st.write("Ask me anything about traveling in India!")

# User input
user_input = st.text_input("You: ")

# Only answer when the box holds real text: a whitespace-only string is truthy
# but cannot match any question, so it is treated the same as empty input.
if user_input and user_input.strip():
    response = get_response(user_input)
    st.write(f"Bot: {response}")
