# Proof of Concept - StackAI
## Contents -
1. EDA of the dataset
2. Topic Relevance Search
3. Topic summarization
4. AI-generated answers

### Required modules

In [None]:
import os
import re
from pathlib import Path

import numpy as np
import openai
import pandas as pd
from langchain.chains.summarize import load_summarize_chain
from langchain.llms.openai import OpenAI
from scipy.spatial.distance import cdist
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

### Required datasets

In [8]:
# Folder holding the converted 3d_printing export. Set STACKAI_DATA_DIR to point
# the notebook at another location; the original local path remains the default.
DATA_DIR = Path(os.environ.get(
    "STACKAI_DATA_DIR",
    "C:\\Users\\Dell\\OneDrive - Northeastern University\\courses\\big data and intl analytics\\DAMG7245-Summer2023\\final project\\dataset_converted\\3d_printing",
))
# Raw posts table (questions, answers and other post types).
posts_df = pd.read_csv(DATA_DIR / "posts.csv")

In [9]:
# Folder holding the converted 3d_printing export. Set STACKAI_DATA_DIR to point
# the notebook at another location; the original local path remains the default.
DATA_DIR = Path(os.environ.get(
    "STACKAI_DATA_DIR",
    "C:\\Users\\Dell\\OneDrive - Northeastern University\\courses\\big data and intl analytics\\DAMG7245-Summer2023\\final project\\dataset_converted\\3d_printing",
))
# Raw comments table; each row points at its post through the PostId column.
comments_df = pd.read_csv(DATA_DIR / "comments.csv")

In [10]:
posts_df.head()

Unnamed: 0.1,Unnamed: 0,Id,PostTypeId,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,Tags,AnswerCount,CommentCount,AcceptedAnswerId,LastEditorUserId,ParentId
0,0,1,1,2016-01-12T18:45:19.963,10,424.0,<p>When I've printed an object I've had to cho...,16.0,2017-10-31T02:31:08.560,How to obtain high resolution prints in a shor...,<resolution><speed><quality>,2.0,6,51.0,,
1,1,2,1,2016-01-12T18:45:51.287,34,7377.0,"<p>I would like to buy a 3D printer, but I'm c...",20.0,2019-06-10T23:18:34.190,Is 3D printing safe for your health?,<print-material><safety><health>,4.0,1,12.0,334.0,
2,2,3,1,2016-01-12T18:46:22.083,18,2678.0,<p>I know the minimum layer height will effect...,11.0,2016-09-19T15:41:06.537,How important is the minimum layer height on a...,<quality><resolution>,3.0,5,152.0,11.0,
3,3,4,1,2016-01-12T18:50:55.973,18,384.0,<p>Plastic is used in 3D FDM/FFF printing part...,16.0,2016-06-10T13:32:20.493,Are there any metals that exhibit a large glas...,<fdm><material><print-material><metal-parts>,4.0,0,1289.0,98.0,
4,4,5,1,2016-01-12T18:53:53.623,40,3985.0,<p>What are the main differences when using AB...,11.0,2017-08-02T09:49:07.263,How is PLA different from ABS material?,<filament><abs><fdm><pla>,5.0,5,77.0,20.0,


In [11]:
comments_df.head()

Unnamed: 0.1,Unnamed: 0,Id,PostId,Score,Text,CreationDate,UserId,ContentLicense
0,0,1,1,4,Did I just place the first upvote? Congrats o...,2016-01-12T18:47:12.573,23.0,CC BY-SA 3.0
1,1,2,3,1,I think it would be a good idea to specify wha...,2016-01-12T19:01:35.180,10.0,CC BY-SA 3.0
2,2,3,1,1,What are you looking for in an answer? You are...,2016-01-12T19:42:08.613,26.0,CC BY-SA 3.0
3,3,4,21,9,It is worth noting here that acetone finishing...,2016-01-12T19:42:19.530,36.0,CC BY-SA 3.0
4,4,5,5,7,I think this is too broad for a good exhaustiv...,2016-01-12T19:44:02.667,36.0,CC BY-SA 3.0


In [12]:
posts_df.dtypes

Unnamed: 0            int64
Id                    int64
PostTypeId            int64
CreationDate         object
Score                 int64
ViewCount           float64
Body                 object
OwnerUserId         float64
LastActivityDate     object
Title                object
Tags                 object
AnswerCount         float64
CommentCount          int64
AcceptedAnswerId    float64
LastEditorUserId    float64
ParentId            float64
dtype: object

In [13]:
comments_df.dtypes

Unnamed: 0          int64
Id                  int64
PostId              int64
Score               int64
Text               object
CreationDate       object
UserId            float64
ContentLicense     object
dtype: object

In [14]:
posts_df.shape

(14475, 16)

In [15]:
comments_df.shape

(24320, 8)

In [16]:
# Posts: switch the raw headers to the lowercase, post_-prefixed names used
# by every later cell. Columns not listed here keep their original names.
post_column_names = dict(
    Id='post_id',
    PostTypeId='posttypeid',
    CreationDate='post_creationdate',
    Score='post_score',
    ViewCount='post_viewcount',
    Body='post_body',
    Title='post_title',
    Tags='post_tags',
    AcceptedAnswerId='post_acceptedanswerid',
)
posts_df = posts_df.rename(columns=post_column_names)

# Comments: prefix the three fields that would otherwise clash with the
# post columns once the two tables are merged.
comment_column_names = dict(
    Text='comments_text',
    Score='comments_score',
    CreationDate='comments_creationdate',
)
comments_df = comments_df.rename(columns=comment_column_names)

### Mapping each accepted answer to its post

In [17]:
# Left self-join: every post is paired with the row whose post_id equals its
# post_acceptedanswerid. Clashing right-hand columns receive the "_accepted"
# suffix; posts without a matching accepted answer get NaN in those columns.
posts_with_accepted = posts_df.merge(
    posts_df,
    how='left',
    left_on='post_acceptedanswerid',
    right_on='post_id',
    suffixes=('', '_accepted'),
)

# Retain the post's own fields plus the accepted answer's body and score.
kept_columns = [
    'post_id', 'posttypeid', 'post_creationdate', 'post_score',
    'post_viewcount', 'post_body', 'post_title',
    'post_tags',
    'post_body_accepted', 'post_score_accepted',
]
posts_df = posts_with_accepted[kept_columns]

### Data Filtering

### We are interested only in questions and answers

In [18]:
# posttypeid legend: 1 = question, 2 = answer. Every other post type is dropped.
QUESTION_TYPE, ANSWER_TYPE = 1, 2
is_question_or_answer = posts_df['posttypeid'].isin([QUESTION_TYPE, ANSWER_TYPE])
posts_df = posts_df[is_question_or_answer]

### For now, let's consider only 2 tags plus untagged posts (for testing)

In [19]:
# Select rows whose post_tags value is exactly <2d> or <3d-design>,
# together with rows that carry no tag value at all.
selected_tags = ['<2d>', '<3d-design>']
tag_values = posts_df['post_tags']
mask = tag_values.isnull() | tag_values.isin(selected_tags)
posts_df = posts_df[mask]

In [20]:
posts_df.shape

(8508, 10)

### Using small dataset for testing

In [22]:
# Small working sample for testing.
# Step 1: discard rows that have no post_title.
titled_posts = posts_df.dropna(subset=['post_title'])

# Step 2: cap the sample at the first 100 rows (fewer if not that many remain).
posts_df = titled_posts.iloc[:100]
posts_df.shape

(50, 10)

### Mapping comments to their posts

In [23]:
# Left-join comments onto posts (post_id <-> PostId): a post with several
# comments yields several rows, a post with none yields one row of NaNs.
df_result = posts_df.merge(
    comments_df,
    how='left',
    left_on='post_id',
    right_on='PostId',
)

In [24]:
# Discard every row missing any of the three comment fields; after the left
# join this removes the posts that have no comment.
required_comment_fields = ['comments_text', 'comments_score', 'comments_creationdate']
df_result = df_result.dropna(subset=required_comment_fields)

In [25]:
# Narrow the merged frame to the post fields followed by the comment fields.
post_fields = [
    'post_id', 'posttypeid', 'post_creationdate', 'post_score',
    'post_viewcount', 'post_body', 'post_title',
    'post_tags', 'post_body_accepted', 'post_score_accepted',
]
comment_fields = ['comments_text', 'comments_score', 'comments_creationdate']
df_result = df_result[post_fields + comment_fields]

In [26]:
# Posts table: the merge repeated each post once per comment, so project the
# post-level columns and collapse the repeats back to one row per distinct post.
post_level_columns = [
    'post_id', 'posttypeid', 'post_creationdate', 'post_score',
    'post_viewcount', 'post_body', 'post_title',
    'post_tags', 'post_body_accepted', 'post_score_accepted',
]
posts = df_result[post_level_columns].drop_duplicates()



In [27]:
# Comments table: one row per comment, keyed by the post_id it belongs to.
comment_level_columns = [
    'post_id',
    'comments_text',
    'comments_score',
    'comments_creationdate',
]
comments = df_result[comment_level_columns]

In [28]:
posts.shape

(30, 10)

In [29]:
comments.shape

(91, 4)

In [30]:
posts.to_csv('posts_cleaned.csv', index=False)

In [31]:
comments.to_csv('comments_cleaned.csv', index=False)

### Cleaned Posts table

In [33]:
posts.head()

Unnamed: 0,post_id,posttypeid,post_creationdate,post_score,post_viewcount,post_body,post_title,post_tags,post_body_accepted,post_score_accepted
0,548,1,2016-02-10T14:52:38.880,9,578.0,<p>Fair warning: I am a complete novice in 3-D...,Printing a CV joint,<3d-design>,,
7,1053,1,2016-04-27T13:12:32.580,1,455.0,"<p>In Google Sketchup, I have text on a surfac...",Pushing Text into surface - Google Sketchup,<3d-design>,<p>Definitely you have to perform substract op...,1.0
10,1264,1,2016-06-05T20:46:02.477,4,288.0,<p>What type of printer would one recommend to...,Car Body shop Printers,<3d-design>,,
16,1383,1,2016-06-18T09:54:45.630,9,2149.0,<p>Is there any simple way of creating tappere...,How to create tappered thread in OpenSCAD?,<3d-design>,<p>I have contacted Dan Kirshner (the author o...,4.0
18,2697,1,2016-08-28T00:32:36.523,-2,1752.0,<p>I am an absolute beginner when it comes to ...,What is a good software for designing car parts,<3d-design>,<p>You've avoided a number of attributes of a ...,2.0


### Cleaned Comments table

In [34]:
comments.head()

Unnamed: 0,post_id,comments_text,comments_score,comments_creationdate
0,548,A couple questions... Are you printing this yo...,0.0,2016-02-10T15:08:32.023
1,548,Thanks! I'm planning on using one of my school...,0.0,2016-02-10T17:23:12.223
2,548,There's a nice video of printing a ball bearin...,0.0,2016-02-13T19:58:22.803
3,548,Here are some uni lab blog entries from 2017 t...,0.0,2018-11-13T16:45:34.727
7,1053,Have you tried ungrouping the text? It's creat...,0.0,2016-04-27T17:52:56.217


In [35]:
posts.dtypes

post_id                  int64
posttypeid               int64
post_creationdate       object
post_score               int64
post_viewcount         float64
post_body               object
post_title              object
post_tags               object
post_body_accepted      object
post_score_accepted    float64
dtype: object

In [50]:
comments.dtypes

post_id                    int64
comments_text             object
comments_score           float64
comments_creationdate     object
dtype: object

In [52]:
posts['post_id'].isnull().sum()

0

## Feature 1 - Topic relevance search

#### The user can ask a question about a category and one of its tags
#### The user's input is embedded and compared with the embeddings of the existing topics; the 5 most similar topics are returned

### SBert - cosine similarity

In [38]:
# Reload the cleaned tables from the exact files the to_csv cells wrote
# ('posts_cleaned.csv' / 'comments_cleaned.csv', relative to the working
# directory). Reading them from a separate absolute folder could silently pick
# up stale or missing copies.
cleaned_data = pd.read_csv('posts_cleaned.csv')
comments_df = pd.read_csv('comments_cleaned.csv')
# Load the pre-trained Sentence Transformers model
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

In [39]:
# Preprocessing function to convert tags from string to list
def process_tags(Tags):
    """Split a raw tag string into a list of individual tags.

    The dataset stores tags back-to-back in angle brackets, e.g.
    ``<resolution><speed><quality>``; each bracketed tag (brackets included)
    becomes one list element. Strings without any bracketed tag fall back to
    comma splitting.

    Parameters
    ----------
    Tags : str, list, tuple or missing value
        Raw tag cell. Lists/tuples are returned as a list unchanged so that
        applying the function twice to the same column does not fail.

    Returns
    -------
    list of str
        The individual tags; an empty list for missing values.
    """
    if isinstance(Tags, (list, tuple)):  # Already converted
        return list(Tags)
    if pd.isna(Tags):  # Handle NaN values
        return []      # Return an empty list for NaN values
    bracketed = re.findall(r'<[^<>]+>', Tags)
    if bracketed:
        return bracketed
    return Tags.split(',')

# Convert the 'post_tags' column of cleaned_data from raw strings into lists.
# NOTE(review): the column is overwritten in place, so re-running this cell
# feeds the already-converted lists back into process_tags.
cleaned_data['post_tags'] = cleaned_data['post_tags'].apply(process_tags)

def generate_embeddings(text):
    """Encode ``text`` with the notebook-level ``model`` and return what ``model.encode`` produces."""
    return model.encode(text)

def filter_data(user_input_tag):
    """Return the rows of ``cleaned_data`` whose ``post_tags`` contain ``user_input_tag``.

    Only the tag is checked; no filtering on ``posttypeid`` is performed here.
    """
    # Membership test per row against that row's tag collection
    has_tag = cleaned_data['post_tags'].apply(lambda tags: user_input_tag in tags)
    return cleaned_data[has_tag]

In [40]:
def topic_relevance_search(user_input, user_input_tag):
    """Find the posts most similar to a free-text question within one tag.

    Parameters
    ----------
    user_input : str
        The user's question; embedded and compared against each post.
    user_input_tag : str
        Tag used to pre-filter the posts via ``filter_data``.

    Returns
    -------
    tuple
        ``(top_similar_topics, top_similarity_scores)``: a list of up to five
        record dicts (``post_id``, ``post_title``, ``posttypeid``,
        ``post_body``, ``post_body_accepted``) ordered by decreasing cosine
        similarity, and the matching scores as a NumPy array. When no post
        carries the tag, a message is printed and ``([], empty array)`` is
        returned, so callers can always unpack two values.
    """
    # Filter data based on the user input tag
    filtered_data = filter_data(user_input_tag)

    # Check if there is relevant data for the user input tag
    if filtered_data.empty:
        print(f"No relevant data found for the tag '{user_input_tag}'.")
        # Same two-value shape as the success path
        return [], np.array([])

    # Title and body joined by a space for embedding, skipping NaN parts
    relevant_text_list = [
        ' '.join(part for part in (title, body) if pd.notna(part))
        for title, body in zip(filtered_data['post_title'], filtered_data['post_body'])
    ]

    # Generate embeddings for user input and relevant data
    user_embedding = generate_embeddings([user_input])
    data_embeddings = generate_embeddings(relevant_text_list)

    # One query row against all posts; [0] extracts that single row of scores
    similarity_scores = cosine_similarity(user_embedding, data_embeddings)[0]

    # Positions of the five highest scores, best first
    top_indices = np.argsort(similarity_scores)[::-1][:5]

    result_columns = ['post_id', 'post_title', 'posttypeid', 'post_body', 'post_body_accepted']
    top_similar_topics = filtered_data.iloc[top_indices][result_columns].to_dict('records')

    top_similarity_scores = similarity_scores[top_indices]

    return top_similar_topics, top_similarity_scores

In [42]:
if __name__ == "__main__":
    # Example usage: rank the <3d-design> posts against a free-text question.
    user_question = "3d printing ring door bell"
    user_tag = "<3d-design>"

    similar_topics, similarity_scores = topic_relevance_search(user_question, user_tag)

    print("Most similar topics:")

    # Topics and scores come back in the same order, so walk them together;
    # only the id and title are needed for the listing.
    for rank, (topic, score) in enumerate(zip(similar_topics, similarity_scores), start=1):
        print(f"{rank}. Post ID: {topic['post_id']} : {topic['post_title']} (similarity score: {score:.2f})")

Most similar topics:
1. Post ID: 4672 : Designing back plate for Ring Doorbell (similarity score: 0.21)
2. Post ID: 20570 : How would this OLED be mounted if in a commercial device? (similarity score: 0.08)
3. Post ID: 8172 : How do I decide what size my push-fit feature should be? (similarity score: 0.07)
4. Post ID: 4681 : Make a nose cone in Fusion 360 (similarity score: 0.02)
5. Post ID: 16106 : Source of design advice for modeling functional parts (similarity score: -0.00)


In [43]:
# Post chosen from the search results (hard-coded for this demo).
selected_topic_post_id = 4681

# Position of that post within similar_topics; None when it is not listed.
selected_topic_index = next(
    (idx for idx, topic in enumerate(similar_topics) if topic['post_id'] == selected_topic_post_id),
    None,
)
if selected_topic_index is None:
    # Fail with a clear message instead of an opaque StopIteration.
    raise ValueError(f"Post ID {selected_topic_post_id} is not among the similar topics.")

selected_topic = similar_topics[selected_topic_index]

post_id = selected_topic['post_id']
post_title = selected_topic['post_title']
posttypeid = selected_topic['posttypeid']
post_body = selected_topic['post_body']
post_body_accepted = selected_topic['post_body_accepted']

print(f"\nSelected topic: {post_title}")

# Accepted answer section (empty when the post has none).
if pd.isna(post_body_accepted):
    print("No accepted answer")
    accepted_answer_text = ""
else:
    print(f"Accepted answer: {post_body_accepted}")
    accepted_answer_text = f"\n\nAccepted answer:\n{post_body_accepted}"

# Post body section: type 2 is labelled as an answer, everything else as a
# description, so post_text is always bound for the later cells.
if posttypeid == 2:
    print(f"Answer: {post_body}")
    post_text = f"\n\nAnswer:\n{post_body}"
else:
    print(f"Description: {post_body}")
    post_text = f"\n\nDescription:\n{post_body}"

# Comments section: one bullet per comment attached to the selected post.
post_comments = comments_df[comments_df['post_id'] == post_id]

comments_text = ""
if not post_comments.empty:
    print("Comments:")
    for comment in post_comments['comments_text']:
        print(f"\t- {comment}")
        comments_text += f"\n- {comment}"
    


Selected topic: Make a nose cone in Fusion 360
Accepted answer: <p>If you have a specific shape in mind and can create a sketch to represent that shape, you are halfway to your goal.</p>

<p>The concept is simple. Create a single line sketch that would represent the desired curve, starting from, in this example, the nose of the cone and traveling to the base. Create only one-half of the nose cone curve and maintain a "standard" axis reference, say, using the Y-axis as the rotation point.</p>

<p>The process is called <a href="http://help.autodesk.com/view/fusion360/ENU/?guid=GUID-5C62F370-7AB4-4DFD-BE61-F8830F30A6D7" rel="noreferrer">revolve. Fusion 360</a> supports this action directly.</p>

<pre><code>In the Sculpt workspace, choose Create Revolve.
Select the profile to revolve.
In the Revolve dialog:
    Click Axis and then select the axis to revolve around.
    Choose Full or Angle to specify whether the revolution is full or to a specific angle.
    For Direction choose One Side,

## Feature 2 - Topic Summarization

#### Gives a summary of the selected post, its comments and its accepted answer

### Langchain - OpenAI summarization

In [45]:
class Document:
    """Minimal document holder exposing ``page_content`` and ``metadata``.

    ``page_content`` is stored as given; ``metadata`` starts as a fresh empty
    dict for every instance.
    """

    def __init__(self, page_content):
        self.metadata = dict()
        self.page_content = page_content

In [46]:
# Summarization payload: title, post body section, accepted-answer section
# (possibly empty) and the bulleted comments, concatenated in that order.
data = f"{post_title}{post_text}{accepted_answer_text}\n\nComments:{comments_text}"
# Wrap the payload in a Document so it can be passed as an input document
doc = Document(page_content=data)

In [49]:
# Initialize the OpenAI LLM and load the summarize chain.
# The API key is taken from the OPENAI_API_KEY environment variable instead of
# a literal in the notebook, so no secret ends up in the saved file.
llm = OpenAI(temperature=0, openai_api_key=os.environ.get("OPENAI_API_KEY", ""))
chain = load_summarize_chain(llm, chain_type="stuff")

# Run the chain over the single document built from the selected post.
# NOTE(review): the default "stuff" summarize prompt appears to consume only the
# document text, so the `question` argument may have no effect — confirm.
summary = chain.run(input_documents=[doc], question="Write a concise summary within 200 words.")
#print summary of post title
print(f"summary of the post - {post_title} : ", summary)

summary of the post - Make a nose cone in Fusion 360 :  
Using Fusion 360, one can create a nose cone for a model rocket by creating a single line sketch of the desired shape, then using the Revolve tool to create the cone. The sketch should start from the nose and travel to the base, and the base segment should be joined while the nose segments should be open and aligned to the Y-axis. Offsetting or hand sketching a parallel line that returns to the nose can add thickness. Tutorials and videos are available online for further guidance.


## Feature 3

### AI - Generated Answers

### Users can view an AI generated answer for their questions 
-  if the most similar topic doesn't have an existing accepted answer
-  if the user is not satisfied with the most similar topic itself
### The model utilizes the topic's dataset along with user's question to generate an answer

In [None]:
# Read the key from the environment rather than hard-coding it; assigning an
# empty literal to openai.api_key would override any key already configured.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")
# User input
user_input = "how to design a back plate for a Ring Doorbell?"
# Concatenate the data: post_title, post_text, accepted_answer_text, and comments_text
data = f"{post_title}{post_text}{accepted_answer_text}\n\nComments:{comments_text}"
openai.api_key = openai_api_key

In [None]:
def get_openai_response(user_input, data):
    """Ask gpt-3.5-turbo to answer ``user_input`` using ``data`` as context.

    Parameters
    ----------
    user_input : str
        The user's question, sent as the user message.
    data : str
        Context text (post, accepted answer, comments), sent as a second
        system message prefixed with ``Data: ``.

    Returns
    -------
    str
        The content of the first returned choice, stripped of surrounding
        whitespace.
    """
    # Instruction first, then the question, then the supporting data
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "Give the answer to the user input using the data. If the answer doesn't exist in data, create your own answer."},
            {"role": "user", "content": user_input},
            {"role": "system", "content": f"Data: {data}"}
        ],
        temperature=0.2,
        max_tokens=500
    )

    # Extract and return the model-generated answer from the response
    return response["choices"][0]["message"]["content"].strip()

In [None]:
# Generate the answer for the question using the selected post as context,
# then show it prefixed with "AI: ".
answer = get_openai_response(user_input, data)
print(f"AI: {answer}")

# xxxxx