In [23]:
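# Install necessary libraries if not already installed
# (%pip installs into the running kernel's environment; OpenAIEmbeddings also needs openai and tiktoken)
# %pip install pandas numpy langchain openai tiktoken faiss-cpu scikit-learn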

# Import required libraries
import pandas as pd
import numpy as np
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from sklearn.preprocessing import LabelEncoder
from langchain.document_loaders import CSVLoader


In [24]:

# Read the four generated CSV files, in order, into their DataFrames
user_data, booking_history, train_schedule, pricing_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/user_data.csv",
        "../data/booking_history.csv",
        "../data/train_schedule.csv",
        "../data/pricing_data.csv",
    )
)

In [25]:
# Report (rows, columns) for every loaded dataset, one line per dataset
shape_report = [
    f"User Data: {user_data.shape}",
    f"Booking History: {booking_history.shape}",
    f"Train Schedule: {train_schedule.shape}",
    f"Pricing Data: {pricing_data.shape}",
]
print("\n".join(shape_report))


User Data: (5, 6)
Booking History: (100, 7)
Train Schedule: (100, 7)
Pricing Data: (100, 6)


# -------------------------------
# Step 1: Data Preprocessing
# -------------------------------


In [26]:

# Merge booking history with user data (left join keeps every booking row)
df = booking_history.merge(user_data, on="User ID", how="left")

# Merge with train schedule (using Train Name)
df = df.merge(train_schedule, on="Train Name", how="left")

# Merge with pricing data on Train Name only.
# NOTE(review): no class column is part of the join key; if pricing_data holds
# several rows per train, this join multiplies booking rows — confirm the key.
df = df.merge(pricing_data, on="Train Name", how="left")

# Merge with transaction data using User ID.
# `transaction_data` is not loaded by any visible cell, so the merge used to
# raise NameError on a fresh kernel. Load it here and skip the merge when the
# file is absent; the feature lists below already tolerate missing columns.
# NOTE(review): path assumed by analogy with the other datasets — confirm.
try:
    transaction_data = pd.read_csv("../data/transaction_data.csv")
except FileNotFoundError:
    transaction_data = None
    print("Warning: ../data/transaction_data.csv not found; skipping transaction merge.")

if transaction_data is not None:
    df = df.merge(transaction_data, on="User ID", how="left")

# Drop unnecessary columns (errors="ignore": no failure when "Date" is absent)
df = df.drop(columns=["Date"], errors="ignore")

# Fill missing values left by the joins with a placeholder label
df = df.fillna("Unknown")

# Ensure all expected columns exist before applying Label Encoding
available_categorical_features = [col for col in [
    "Gender", "Location", "Preferred Class", "Loyalty Status",
    "Train Name", "Departure", "Arrival", "Seat Preference", "Payment Method"
] if col in df.columns]  # Check existence before applying encoding


In [27]:
# Fitted encoder per categorical column, kept so codes can be decoded later
label_encoders = {}

for feature in available_categorical_features:
    le = LabelEncoder()
    df[feature] = le.fit_transform(df[feature])
    label_encoders[feature] = le

# Convert numerical columns to float before normalization
numerical_features = ["Age", "Ticket Price", "Base Price", "Surge Price", "Final Price", "Duration (hrs)"]

for col in numerical_features:
    if col not in df.columns:  # Ensure column exists before applying transformation
        continue
    # errors="coerce" turns non-numeric entries (e.g. "Unknown") into NaN
    values = pd.to_numeric(df[col], errors="coerce")
    lowest = values.min()
    value_range = values.max() - lowest
    if pd.notna(value_range) and value_range != 0:
        df[col] = (values - lowest) / value_range  # Min-max normalize to [0, 1]
    else:
        # Constant (or all-NaN) column: dividing by a zero range would turn
        # every valid entry into NaN, so map valid entries to 0.0 instead.
        df[col] = values - lowest

# Convert all columns to string for embedding processing.
# Any existing "combined_features" column is excluded so that re-running this
# cell does not fold the previous combined text back into the new one.
# NOTE(review): categorical columns are integer codes and numeric columns are
# scaled at this point, so the combined text holds numbers rather than the
# original labels — confirm this is what the embedding step should receive.
feature_columns = [column for column in df.columns if column != "combined_features"]
df["combined_features"] = df[feature_columns].astype(str).agg(" ".join, axis=1)




# ---------------------------------
# Step 2: Create LangChain Embeddings
# ---------------------------------


In [None]:
# Initialize LangChain OpenAI Embeddings
embedding_model = OpenAIEmbeddings()

# Convert text data to embeddings
train_data = df["combined_features"].tolist()
embeddings = embedding_model.embed_documents(train_data)

# One metadata dict per row (every column except the combined text), so a
# search hit carries the ticket attributes instead of an empty dict.
# Categorical values here are the label-encoded codes; `label_encoders`
# holds the fitted encoders needed to decode them.
ticket_metadata = df.drop(columns=["combined_features"]).to_dict(orient="records")

# Create FAISS index for similarity search.
# from_embeddings expects (text, vector) pairs plus the embedding object used
# for later queries; passing only the list of vectors raises TypeError.
vector_store = FAISS.from_embeddings(
    text_embeddings=list(zip(train_data, embeddings)),
    embedding=embedding_model,
    metadatas=ticket_metadata,
)

# Save the FAISS index
vector_store.save_local("../deployment/faiss_index")

print("Model training complete. FAISS index saved.")


# ---------------------------------
# Step 3: Model Testing (Simple Recommendation)
# ---------------------------------


In [None]:

def recommend_tickets(user_input: str, k: int = 5) -> list:
    """
    Recommend train tickets based on a free-text user query.

    The query is embedded with the module-level `embedding_model` and the
    module-level `vector_store` is searched for the nearest stored vectors.

    Args:
        user_input: Free-text description of the desired ticket.
        k: Number of nearest matches to return. Defaults to 5, the value
            that was previously hard-coded.

    Returns:
        A list with the `metadata` of each matched document, in the order
        returned by the vector store.
    """
    input_embedding = embedding_model.embed_query(user_input)
    similar_docs = vector_store.similarity_search_by_vector(input_embedding, k=k)

    recommendations = [doc.metadata for doc in similar_docs]
    return recommendations

# Smoke test: run one sample query through the recommender and list the hits
sample_user_query = "Business class ticket from CityA to CityB with a window seat"
recommendations = recommend_tickets(sample_user_query)

print("\nTop Recommended Tickets:")
for rank, ticket in enumerate(recommendations, start=1):
    print(f"{rank}. {ticket}")
