In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory holding the cleaned 3d_printing CSV exports. Kept in one place so
# that pointing the notebook at another machine or dataset is a one-line edit
# instead of two long, duplicated absolute paths.
DATA_DIR = "C:\\Users\\Dell\\OneDrive - Northeastern University\\courses\\big data and intl analytics\\DAMG7245-Summer2023\\final project\\dataset_converted\\3d_printing"

# Load the cleaned posts and their comments from CSV into pandas DataFrames
cleaned_data = pd.read_csv(DATA_DIR + "\\posts_cleaned.csv")
comments_df = pd.read_csv(DATA_DIR + "\\comments_cleaned.csv")

# Load the pre-trained Sentence Transformers model used by generate_embeddings
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

In [3]:
# Preprocessing function to convert tags from string to list
def process_tags(Tags):
    """Convert one raw tag cell into a list of tag strings.

    Parameters
    ----------
    Tags : str, list, tuple, or missing value
        A comma-separated tag string, an already-parsed sequence of tags,
        or a missing value (NaN / None).

    Returns
    -------
    list
        The tags split on ',' (no whitespace trimming is performed), a copy
        of the sequence if it was already parsed, or an empty list for a
        missing value.
    """
    # Already-parsed values pass straight through. The column is overwritten
    # in place by the cell that applies this function, so re-running that cell
    # feeds lists back in; pd.isna() on a list returns an array whose truth
    # value is ambiguous, which would make the check below raise.
    if isinstance(Tags, (list, tuple)):
        return list(Tags)
    if pd.isna(Tags):  # Handle NaN / None values
        return []
    return Tags.split(',')

# Parse the 'post_tags' column in place: every cell is replaced by the list
# that process_tags returns for it.
cleaned_data['post_tags'] = cleaned_data['post_tags'].map(process_tags)

def generate_embeddings(text):
    """Encode `text` with the module-level sentence-transformer `model`.

    `text` is forwarded unchanged to `model.encode`, and whatever that call
    returns is handed back to the caller.
    """
    return model.encode(text)

def filter_data(user_input_tag):
    """Return the rows of `cleaned_data` whose 'post_tags' contain the tag.

    Membership is tested with `user_input_tag in tags` for every row's
    'post_tags' value. No filtering on "posttypeid" is performed here.
    """
    # Boolean mask: True for rows whose tag collection includes the requested tag
    has_tag = cleaned_data['post_tags'].apply(lambda tags: user_input_tag in tags)
    return cleaned_data[has_tag]

In [4]:
def topic_relevance_search(user_input, user_input_tag, top_tags=10, top_n=5):
    """Find the posts carrying a tag that are most similar to a user query.

    Parameters
    ----------
    user_input : str
        Free-text question to compare against the posts.
    user_input_tag : str
        Tag used to pre-filter the posts (see `filter_data`).
    top_tags : int, optional
        Unused; retained only so existing callers keep working.
    top_n : int, optional
        Maximum number of posts to return (default 5).

    Returns
    -------
    (list of dict, numpy.ndarray)
        One record per selected post, in descending order of similarity, with
        keys 'post_title', 'comments_text', 'posttypeid', 'post_body' and
        'post_body_accepted'; and the matching cosine-similarity scores.
        'comments_text' is the post's comments joined with newlines, or None
        when it has none. Both elements are empty when no post has the tag.
    """
    filtered_data = filter_data(user_input_tag)

    # Return the same (topics, scores) shape as the success path so callers
    # can always unpack two values.
    if filtered_data.empty:
        print(f"No relevant data found for the tag '{user_input_tag}'.")
        return [], np.array([])

    # Title and body joined with a space, skipping whichever one is missing
    relevant_text_list = [
        ' '.join(part for part in (title, body) if pd.notna(part))
        for title, body in zip(filtered_data['post_title'], filtered_data['post_body'])
    ]

    # Generate embeddings for user input and relevant data
    user_embedding = generate_embeddings([user_input])
    data_embeddings = generate_embeddings(relevant_text_list)

    # Cosine similarity between the query and every filtered post
    similarity_scores = cosine_similarity(user_embedding, data_embeddings)[0]

    # Positions (within filtered_data) of the best-scoring posts, best first
    top_indices = np.argsort(similarity_scores)[::-1][:top_n]

    # Select the posts BEFORE joining comments: the scores are aligned with
    # filtered_data, while a post/comment merge can hold several rows per
    # post, so positional indexing into the merged frame picks wrong rows.
    # '_rank' remembers each post's place in the ranking across the merge.
    ranked_posts = filtered_data.iloc[top_indices].assign(
        _rank=np.arange(len(top_indices))
    )
    merged_data = pd.merge(ranked_posts, comments_df, on='post_id', how='left')

    output_columns = ['post_title', 'comments_text', 'posttypeid', 'post_body', 'post_body_accepted']
    top_similar_topics = []
    for _, post_rows in merged_data.groupby('_rank', sort=True):
        # Post-level fields are identical on every row of the group
        topic = post_rows.iloc[0][output_columns].to_dict()
        # Collapse the post's comments into one newline-separated string;
        # None (rather than NaN) signals that there are no comments.
        comments = post_rows['comments_text'].dropna()
        topic['comments_text'] = '\n'.join(comments.astype(str)) if len(comments) else None
        top_similar_topics.append(topic)

    top_similarity_scores = similarity_scores[top_indices]

    return top_similar_topics, top_similarity_scores




In [5]:
if __name__ == "__main__":
    # Example usage: look up posts tagged <3d-design> that best match a question
    user_question = "what is stl file?"
    user_tag = "<3d-design>"

    similar_topics, similarity_scores = topic_relevance_search(user_question, user_tag)

    for rank, (topic, score) in enumerate(zip(similar_topics, similarity_scores), start=1):
        print(f"{rank}. {topic['post_title']} (similarity score: {score:.2f})")
        post_type = "Question" if topic['posttypeid'] == 1 else "Answer"
        print(f"{post_type}: {topic['post_body']}")
        if pd.notna(topic['post_body_accepted']):
            print(f"Accepted Answer: {topic['post_body_accepted']}")
        else:
            print("No accepted answer.")
        print("Comments:")
        comments_text = topic['comments_text']
        # A post without comments can surface as NaN (a float) after the left
        # merge rather than None, so test for an actual string before
        # splitting; `is not None` let NaN through and .split() then failed.
        if isinstance(comments_text, str):
            for j, comment in enumerate(comments_text.split('\n'), start=1):
                print(f"  {j}. {comment}")
        else:
            print("  No comments found.")

1. What is a good software for designing car parts (similarity score: 0.26)
Question: <p>I am an absolute beginner when it comes to 3d printing. I want to get into the hobby by designing aero automotive parts such as fender flares, custom gauge and switch pods, lips etc. I've never taken a CAD course but I would say I'm proficient enough with computers as I work as a developer. My uncle is an architect and it seems like autocad might be something good to go with. What are some other good alternatives that allow accurate modeling down to millimeters and possibly breaking larger objects into smaller 3d printable pieces to mash together? </p>

Accepted Answer: <p>You've avoided a number of attributes of a poor question by specifying your objective in detail, while also providing some indication of your experience. AutoCAD is not well known as being 3d printer friendly, although a skilled AutoCAD design user may be able to create acceptable models.</p>

<p>One could consider more organic m