In [274]:
import logging
import math
import os
import re
import string
from collections import defaultdict, Counter

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook relies on: the stop-word list,
# the 'punkt' tokenizer models and the WordNet data.
for _nltk_package in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(_nltk_package)

# Shared module-level helpers: a WordNet lemmatizer and the English stop-word set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [275]:
# Mount Google Drive at /content/drive so the Colab runtime can read files
# stored there (main() loads its documents from a folder under this mount).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [276]:
def read_cardocuments(path):
    """Read every ``.txt`` file directly inside ``path`` as UTF-8 text.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        A ``(content, doc_id_filename)`` tuple of dicts, both keyed by an
        integer document id starting at 0: ``content`` maps the id to the
        file's text and ``doc_id_filename`` maps it to the file name.

    Raises:
        OSError: If ``path`` cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    content = {}
    doc_id_filename = {}
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # doc_id assigned to each file reproducible from run to run.
    text_files = sorted(name for name in os.listdir(path) if name.endswith(".txt"))
    for doc_id, filename in enumerate(text_files):
        with open(os.path.join(path, filename), 'r', encoding='utf-8') as file:
            content[doc_id] = file.read()
        doc_id_filename[doc_id] = filename
        logging.info(f"Loaded file: {filename} with doc_id: {doc_id}")
    return content, doc_id_filename

In [277]:
def tokenize(text):
    """Lowercase ``text`` and split it on whitespace.

    Anything that is not a ``str`` yields an empty list.
    """
    if not isinstance(text, str):
        return []
    lowered = text.lower()
    return lowered.split()

In [278]:
def term_frequency(term, document):
    """Return the share of tokens in ``document`` that equal ``term``.

    Args:
        term: Token to count.
        document: Sequence of tokens (must support ``count`` and ``len``).

    Returns:
        ``document.count(term) / len(document)``, or ``0.0`` for an empty
        document.
    """
    total_tokens = len(document)
    # An empty document (e.g. a query made only of stop words) has no terms;
    # report 0.0 instead of dividing by zero.
    if total_tokens == 0:
        return 0.0
    return document.count(term) / total_tokens

In [279]:
def inverse_document_frequency(term, all_documents):
    """Return ``log(N / (1 + df))`` for ``term`` over ``all_documents``.

    ``N`` is the number of documents and ``df`` the number of documents for
    which ``term in doc`` is true.

    Args:
        term: Token to look up.
        all_documents: Sized collection of documents; each must support ``in``.

    Returns:
        The IDF weight as a float, or ``0.0`` when there are no documents.
        The value is negative when the term occurs in every document,
        because the denominator is then ``N + 1``.
    """
    total_docs = len(all_documents)
    # With no documents the ratio is 0 and math.log(0) raises ValueError.
    if total_docs == 0:
        return 0.0
    num_docs_containing_term = sum(1 for doc in all_documents if term in doc)
    return math.log(total_docs / (1 + num_docs_containing_term))

In [280]:
def compute_tfidf(document, all_documents, vocab):
    """Build the TF-IDF vector of ``document`` over ``vocab``.

    Each entry is ``term_frequency(term, document)`` multiplied by
    ``inverse_document_frequency(term, all_documents)``, in the iteration
    order of ``vocab``.

    Returns:
        A NumPy array with one weight per vocabulary term.
    """
    weights = [
        term_frequency(term, document) * inverse_document_frequency(term, all_documents)
        for term in vocab
    ]
    return np.array(weights)

In [281]:
def convert_doc_ids_to_filenames(doc_ids, doc_id_to_filename):
    """Look up each id of ``doc_ids`` in ``doc_id_to_filename``.

    Returns:
        A list of filenames in the same order as ``doc_ids``.

    Raises:
        KeyError: If an id is missing from the mapping.
    """
    filenames = []
    for doc_id in doc_ids:
        filenames.append(doc_id_to_filename[doc_id])
    return filenames

In [282]:
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Args:
        vec1, vec2: Array-likes (NumPy arrays of any shape, lists, tuples).
            Both are flattened to 1-D and must then have the same length.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2| + 1e-8)`` as a NumPy scalar.
        The small constant in the denominator avoids division by zero, so an
        all-zero vector gives a similarity of 0.

    Raises:
        ValueError: If the flattened vectors differ in length.
    """
    # np.asarray lets plain Python sequences through; flattening avoids shape
    # mismatches such as (1, n) versus (n,).
    flat1 = np.asarray(vec1).reshape(-1)
    flat2 = np.asarray(vec2).reshape(-1)

    dot_product = np.dot(flat1, flat2)
    norm_product = np.linalg.norm(flat1) * np.linalg.norm(flat2)
    return dot_product / (norm_product + 1e-8)

In [283]:
def clean_text(text):
    """Tokenize ``text`` into lowercase tokens without punctuation or stop words.

    Args:
        text: Raw text, tokenized with NLTK's ``word_tokenize``.

    Returns:
        A list of lowercased tokens, in their original order, excluding
        tokens made up entirely of punctuation characters and tokens found
        in the module-level English ``stop_words`` set.
    """
    punctuation_chars = set(string.punctuation)
    cleaned_tokens = []
    for token in word_tokenize(text):
        # Drop tokens made up solely of punctuation. A plain
        # `token in string.punctuation` is a substring test and misses
        # multi-character tokens such as "''", "``", "..." or "--".
        if all(char in punctuation_chars for char in token):
            continue
        lowered = token.lower()
        # Use the module-level stop-word set rather than rebuilding it from
        # the corpus on every call.
        if lowered not in stop_words:
            cleaned_tokens.append(lowered)
    return cleaned_tokens



In [284]:
def load_cars(folder_path):
    """Load every ``.txt`` file directly inside ``folder_path``.

    Each file is decoded as UTF-8; if that fails it is re-read as Latin-1.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A ``(car_data, doc_id_to_filename)`` tuple of dicts, both keyed by an
        integer document id starting at 0: ``car_data`` maps the id to the
        file's text and ``doc_id_to_filename`` maps it to the file name.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
    """
    car_data = {}
    doc_id_to_filename = {}
    # os.listdir() order is arbitrary; sorting keeps doc ids stable across runs.
    text_files = sorted(
        name for name in os.listdir(folder_path) if name.endswith(".txt")
    )
    for doc_id, filename in enumerate(text_files):
        file_path = os.path.join(folder_path, filename)
        # UTF-8 must be tried first: Latin-1 maps every byte to a character
        # and never raises, so trying it first would silently garble UTF-8
        # files and leave the fallback unreachable.
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except UnicodeDecodeError:
            with open(file_path, 'r', encoding='latin-1') as f:
                content = f.read()

        car_data[doc_id] = content
        doc_id_to_filename[doc_id] = filename
    return car_data, doc_id_to_filename


In [285]:
def extract_car_data(text):
    """Pull labelled car fields out of free text.

    Every ``make:`` / ``model:`` label (case-insensitive) contributes its
    value, i.e. the text up to the next comma or newline, stripped; a later
    occurrence of the same label overwrites an earlier one. The first
    ``year:`` (four digits), ``price:`` (optional ``$``, then digits and
    commas) and ``mileage:`` (digits and commas) matches are stored as
    strings.

    Returns:
        A dict containing only the fields that were found, with lowercase keys.
    """
    extracted = {
        label.lower(): raw_value.strip()
        for label, raw_value in re.findall(r"(?i)(make|model):\s*([^\n,]+)", text)
    }

    # Fields for which only the first occurrence in the text is kept.
    first_match_fields = (
        ("year", r"(?i)year:\s*(\d{4})"),
        ("price", r"(?i)price:\s*\$?([\d,]+)"),
        ("mileage", r"(?i)mileage:\s*([\d,]+)"),
    )
    for field, pattern in first_match_fields:
        hit = re.search(pattern, text)
        if hit:
            extracted[field] = hit.group(1)

    return extracted

In [286]:
def process_queries(query, all_documents, doc_tfidf_vectors, vocab, top_k=3):
    """Rank documents by cosine similarity to ``query``.

    The query is cleaned with ``clean_text`` and turned into a TF-IDF vector
    over ``vocab``, then compared with every vector in ``doc_tfidf_vectors``.

    Returns:
        Up to ``top_k`` ``(index, similarity)`` tuples, highest similarity
        first, where ``index`` is the vector's position in
        ``doc_tfidf_vectors``.
    """
    query_vector = compute_tfidf(clean_text(query), all_documents, vocab)

    scored = [
        (position, cosine_similarity(query_vector.reshape(1, -1), vector.reshape(1, -1)))
        for position, vector in enumerate(doc_tfidf_vectors)
    ]

    # The sort is stable, so documents with equal scores keep their input order.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]

In [301]:
def main(folder_path="/content/drive/MyDrive/Tech400_final_project"):
    """Prompt for a query and print the best-matching car documents.

    Loads the ``.txt`` documents in ``folder_path``, builds a TF-IDF vector
    for each, ranks them against the query typed by the user and prints up
    to three matches with their score and any fields that
    ``extract_car_data`` finds in the document text.

    Args:
        folder_path: Directory holding the car documents. Defaults to the
            project folder on the mounted Google Drive.
    """
    print("Retrieving Car Data...")
    car_data, doc_id_to_filename = load_cars(folder_path)
    queries = input("Enter your queries: ")

    # car_data is keyed 0..n-1 in insertion order, so a document's position
    # in these lists is also its doc_id.
    tokenized_docs = [clean_text(doc) for doc in car_data.values()]
    vocab = sorted(set(word for doc in tokenized_docs for word in doc))
    doc_tfidf_vectors = [compute_tfidf(doc, tokenized_docs, vocab) for doc in tokenized_docs]

    print(f"Searching for the result:'{queries}' ")
    similarities = process_queries(queries, tokenized_docs, doc_tfidf_vectors, vocab)
    # Keep only documents that actually share weight with the query, so the
    # "no match" message below can be reached.
    matches = [(doc_id, score) for doc_id, score in similarities if score > 0]

    # Keys produced by extract_car_data, paired with their display labels.
    detail_labels = (
        ("make", "Make"),
        ("model", "Model"),
        ("year", "Year"),
        ("price", "Price"),
        ("mileage", "Mileage"),
    )

    if matches:
        print(f"\nTop {len(matches)} Matching Cars:")
        for idx, (doc_id, score) in enumerate(matches, 1):
            car_info = extract_car_data(car_data[doc_id])
            car_name = os.path.splitext(doc_id_to_filename[doc_id])[0]
            print(f"Car {idx}: {car_name},Score: {score:.4f}")
            # Display whichever details were extracted from the document text.
            for key, label in detail_labels:
                if key in car_info:
                    print(f"  {label}: {car_info[key]}")
    else:
        print("No matching cars found.")

if __name__ == "__main__":
    main()

Retrieving Car Data...
Enter your queries: Horsepower
Searching for the result:'Horsepower' 

Top 3 Matching Cars:
Car 1: Range Rover Sport SVR ,Score: 0.0047
Car 2: Jaguar F-PACE SVR ,Score: 0.0046
Car 3: Mercedes-AMG GLE 63 S ,Score: 0.0044
