In [None]:
pip install torch transformers


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load the CSV file
df = pd.read_csv('clean_details.csv')

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to encode text using BERT
def bert_encode(text):
    encoded_input = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    with torch.no_grad():
        output = model(**encoded_input)
    return output.last_hidden_state[:, 0, :].squeeze().numpy()  # Output of the [CLS] token

# Extract the question and the answers
question = df.columns[1]  # Assuming the question is in the second column header
answers = df[question].tolist()

# Encode the question and answers
question_vector = bert_encode(question)
answer_vectors = [bert_encode(answer) for answer in answers]

# Calculate cosine similarities
similarities = [cosine_similarity([question_vector], [answer_vector])[0][0] for answer_vector in answer_vectors]


# Print the results
for answer, similarity in zip(answers, similarities):
    print(f"Answer: {answer} - Similarity: {similarity}")



Answer: Clean base boards & toilet seat hinges - Similarity: 0.7144755125045776
Answer: a clean bathroom. particularly, mirrors. - Similarity: 0.8628876805305481
Answer: Toilet, or area behind the toilet is a tell-all. I’d also add in baseboards. - Similarity: 0.824443519115448
Answer: Shoes off when people enter leaving them at the front door - Similarity: 0.9035292863845825
Answer: I am short as well and was horrified at how much dust I missed bc I genuinely can't see it. I took one of those long duster things on the extending pole to the whole house recently and that helped me get those higher areas much easier - Similarity: 0.7121673822402954
Answer: This is what I do too! Super easy. Then I just take my Swiffer duster to pick up any remaining bits. - Similarity: 0.8771483302116394
Answer: Thanks - Similarity: 0.9147782921791077
Answer: Thank you for this! Totally going to be my new way to do it. - Similarity: 0.9010915160179138
Answer: Leg warmers are the perfect fit for fan blade

In [None]:
df['sim'] = similarities

In [None]:
df[df['sim'] == max(df['sim'])]

Unnamed: 0,Detail,"What’s the small detail in a home that makes you think “oh, these people are very clean”?",sim
1297,936,isn't that just a swiffer sweeper?,0.956371


In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pandas as pd


# Create a list of tagged documents
tagged_data = [TaggedDocument(words=[_d.lower() for _d in doc.split()], tags=[str(i)]) for i, doc in enumerate([question] + answers)]

# Train a Doc2Vec model
model = Doc2Vec(vector_size=50, min_count=1, epochs=40)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Infer vectors
question_vec = model.infer_vector(question.split())
answer_vecs = [model.infer_vector(answer.split()) for answer in answers]

# Calculate cosine similarities
similarity_scores = cosine_similarity([question_vec], answer_vecs).flatten()

df['doc_ssim'] = similarity_scores
# Print results
for answer, score in zip(answers, similarity_scores):
    print(f"Answer: {answer} - Similarity: {score}")


Answer: Clean base boards & toilet seat hinges - Similarity: 0.10916729271411896
Answer: a clean bathroom. particularly, mirrors. - Similarity: 0.37643226981163025
Answer: Toilet, or area behind the toilet is a tell-all. I’d also add in baseboards. - Similarity: 0.07989636808633804
Answer: Shoes off when people enter leaving them at the front door - Similarity: 0.2204776406288147
Answer: I am short as well and was horrified at how much dust I missed bc I genuinely can't see it. I took one of those long duster things on the extending pole to the whole house recently and that helped me get those higher areas much easier - Similarity: 0.01988755166530609
Answer: This is what I do too! Super easy. Then I just take my Swiffer duster to pick up any remaining bits. - Similarity: -0.003890983760356903
Answer: Thanks - Similarity: 0.10116450488567352
Answer: Thank you for this! Totally going to be my new way to do it. - Similarity: -0.07773862779140472
Answer: Leg warmers are the perfect fit fo

In [None]:
df[df['doc_ssim'] == max(df['doc_ssim'])]

Unnamed: 0,Detail,"What’s the small detail in a home that makes you think “oh, these people are very clean”?",sim,doc_ssim
446,17,"Homeowners who apologize for ""the mess"" in the...",0.438188,0.811076


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([question] + answers)  # Include question as part of the corpus

# Calculate cosine similarities (omit the first vector as it is the question itself)
similarity_scores = cosine_similarity(vectors[0:1], vectors[1:]).flatten()

df['tf_idf'] = similarity_scores

In [None]:
df[df['tf_idf'] == max(df['tf_idf'])]

Unnamed: 0,Detail,"What’s the small detail in a home that makes you think “oh, these people are very clean”?",sim,doc_ssim,tf_idf
505,222,Which small detail in a home makes you think*,0.929212,0.630366,0.658839


In [None]:
from transformers import AutoModel, AutoTokenizer

# For instance, loading a BERT model, which is conceptually different but serves similar NLP tasks
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

def bert_encode(text):
    encoded_input = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    with torch.no_grad():
        output = model(**encoded_input)
    return output.last_hidden_state[:, 0, :].squeeze().numpy()  # Output of the [CLS] token

# Extract the question and the answers
question = df.columns[1]  # Assuming the question is in the second column header
answers = df[question].tolist()

# Encode the question and answers
question_vector = bert_encode(question)
answer_vectors = [bert_encode(answer) for answer in answers]

# Calculate cosine similarities
similarities = [cosine_similarity([question_vector], [answer_vector])[0][0] for answer_vector in answer_vectors]
df['autobert'] = similarities

In [None]:
df[df['autobert'] == max(df['autobert'])]

Unnamed: 0,Detail,"What’s the small detail in a home that makes you think “oh, these people are very clean”?",sim,doc_ssim,tf_idf,autobert
1297,936,isn't that just a swiffer sweeper?,0.956371,0.413723,0.02915,0.956371


In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
df = pd.read_csv('clean_details.csv')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.config.output_hidden_states = True  # Ensure hidden states are returned

# Set padding token if it's not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Function to encode text using GPT-2
def gpt2_encode(text):
    encoded_input = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    with torch.no_grad():
        outputs = model(**encoded_input)
    last_hidden_state = outputs.hidden_states[-1]

    return last_hidden_state.mean(dim=1).squeeze().numpy()
question = df.columns[1]  # Assuming the question is in the second column header
answers = df[question].tolist()

# Encode the question and answers
question_vector = gpt2_encode(question)
answer_vectors = [gpt2_encode(answer) for answer in answers]

# Calculate cosine similarities
similarities = [cosine_similarity([question_vector], [answer_vector])[0][0] for answer_vector in answer_vectors]
df['gpt2'] = similarities


In [6]:
df[df['gpt2'] == max(df['gpt2'])]

Unnamed: 0,Detail,"What’s the small detail in a home that makes you think “oh, these people are very clean”?",gpt2
1135,790,I’ve been to a few bachelor pads where you can...,0.999957


In [7]:
df.head()

Unnamed: 0,Detail,"What’s the small detail in a home that makes you think “oh, these people are very clean”?",gpt2
0,0,Clean base boards & toilet seat hinges,0.977727
1,1,"a clean bathroom. particularly, mirrors.",0.999681
2,10,"Toilet, or area behind the toilet is a tell-al...",0.999859
3,100,Shoes off when people enter leaving them at th...,0.999648
4,1000,I am short as well and was horrified at how mu...,0.999021


In [8]:
# import pandas as pd
# from gensim.models import KeyedVectors
# from sklearn.metrics.pairwise import cosine_similarity
# import numpy as np

# # Load Google's pre-trained Word2Vec model.
# model = KeyedVectors.load_word2vec_format('models/GoogleNews-vectors-negative300.bin', binary=True)

# # Define a function to convert sentences to mean vectors
# def sentence_to_vec(sentence):
#     words = sentence.split()
#     word_vecs = [model[word] for word in words if word in model.vocab]
#     if len(word_vecs) > 0:
#         return np.mean(word_vecs, axis=0)
#     else:
#         return np.zeros(model.vector_size)

# # Encode the question and answers
# question_vec = sentence_to_vec(question)
# answer_vecs = np.array([sentence_to_vec(answer) for answer in answers])

# # Calculate cosine similarities
# similarity_scores = cosine_similarity([question_vec], answer_vecs).flatten()

# df['w2v'] = similarity_scores


In [None]:
# df[df['w2v'] == max(df['w2v'])]