# Load data

In [7]:
import pandas as pd

In [9]:
df1 = pd.read_csv("./data/data_generation_1.csv", index_col=0)
df1.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
363696,363696,493696,493697,What causes hallucinations from weed?,How is Zong weed different from other weed?,0
127668,127668,205495,142438,Why is CNN biased?,"In what ways is CNN biased, if it is?",1
370413,370413,1596,50007,What are the safety precautions on handling sh...,What are the safety precautions on handling sh...,1
304378,304378,10280,11510,Why do long distance relationships fail?,How can I maintain my long distance relationsh...,1
261965,261965,145906,162176,What signs a girl gives when she is interested...,What are the signals that a girl gives if she'...,1


In [10]:
df2 = pd.read_csv("./data/data_generation_2.csv", index_col=0)
df2.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
252969,252969,32257,223578,Do EM Drives actually work?,Does the EM Drive work?,1
60858,60858,48171,6578,How do you know if you’re in love?,How do know that you are in love?,1
77214,77214,131874,131875,What is the difference between a mushroom trip...,How is it to trip on LSD?,0
230895,230895,340561,26783,What do you make of Kapil Sharma's tweet to Na...,What is your opinion on comedian Kapil Sharma'...,1
278257,278257,397497,397498,Which is really the fastest web browser for An...,What is the best web browser for Android?,0


In [12]:
df = pd.concat([df1, df2])
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
363696,363696,493696,493697,What causes hallucinations from weed?,How is Zong weed different from other weed?,0
127668,127668,205495,142438,Why is CNN biased?,"In what ways is CNN biased, if it is?",1
370413,370413,1596,50007,What are the safety precautions on handling sh...,What are the safety precautions on handling sh...,1
304378,304378,10280,11510,Why do long distance relationships fail?,How can I maintain my long distance relationsh...,1
261965,261965,145906,162176,What signs a girl gives when she is interested...,What are the signals that a girl gives if she'...,1


In [16]:
df2 = pd.concat([df[["qid1", "question1"]].rename(columns={"qid1": "uid", "question1": "question"}),
                   df[["qid2", "question2"]].rename(columns={"qid2": "uid", "question2": "question"})])

In [17]:
df2.head()

Unnamed: 0,uid,question
363696,493696,What causes hallucinations from weed?
127668,205495,Why is CNN biased?
370413,1596,What are the safety precautions on handling sh...
304378,10280,Why do long distance relationships fail?
261965,145906,What signs a girl gives when she is interested...


# Sentence embeddings

In [19]:
from sentence_transformers import SentenceTransformer

In [20]:
model = SentenceTransformer("distiluse-base-multilingual-cased-v1", device="cpu")

In [28]:
model.get_parameter

<bound method Module.get_parameter of SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Dense({'in_features': 768, 'out_features': 512, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
)>

In [47]:
df2.iloc[0]

uid                                        493696
question    What causes hallucinations from weed?
Name: 363696, dtype: object

In [24]:
test_sentence = df2.iloc[0]["question"]
test_sentence

'What causes hallucinations from weed?'

In [29]:
emb = model.encode(test_sentence)

In [30]:
emb.shape

(512,)

In [31]:
emb

array([ 1.20304665e-02, -2.67845690e-02, -5.76905161e-02,  2.86435038e-02,
       -7.30092525e-02, -5.36088319e-03, -2.13607438e-02,  1.69918612e-02,
       -4.15378958e-02, -1.25693917e-01,  2.88300700e-02,  6.61542341e-02,
        2.12030187e-02,  7.58857131e-02, -7.77882338e-02,  4.99526113e-02,
       -2.83105746e-02,  4.99075912e-02, -3.86627391e-02,  2.00616885e-02,
       -1.42113511e-02, -2.44045984e-02, -1.27259105e-01,  2.30313372e-02,
       -5.81556596e-02,  2.53441068e-03, -2.51099393e-02,  6.17368482e-02,
       -5.29357232e-02, -2.47612945e-03,  2.65411027e-02,  4.76179235e-02,
       -4.97157723e-02,  6.29093545e-03,  4.23260182e-02,  2.36963462e-02,
       -1.47621136e-03, -5.47419414e-02,  1.01822540e-02, -2.96115167e-02,
        4.93465662e-02,  2.56696972e-03, -5.81334752e-04, -4.84833680e-02,
        1.42848203e-02, -1.03855273e-02,  7.32477754e-02, -2.90084053e-02,
       -7.27411956e-02, -6.46058246e-02,  2.25927643e-02, -7.89794512e-03,
        2.28199195e-02,  

# Build index

In [21]:
from annoy import AnnoyIndex

In [33]:
dim_size = 512

In [40]:
df2.shape

(1109364, 2)

In [42]:
index = AnnoyIndex(dim_size, "angular")

max_size = 100000 
for i, (_, item) in enumerate(df2.iterrows()):
    uid = item.uid
    text = item.question
    
    emb = model.encode(text)
    index.add_item(uid, emb)
    if i > max_size:
        break

In [46]:
index.build(50)

You can't build a loaded index


Exception: You can't build a loaded index

In [45]:
index.save("quora.idx")

True

# Inference

## 1

In [54]:
nns = index.get_nns_by_item(493696, 10)
for nn in nns:
    print(df2.query("uid == {:d}".format(nn)).iloc[0]["question"])

What causes hallucinations from weed?
What causes sphincter paralysis?
What are the causes of having sleep paralysis every night?
How does marijuana affect neuroplasticity?
What causes addiction?
What causes paranoia?
What causes Sexsomnia?
Why does sleep paralysis happens?
What causes déja vu?
What causes dizziness after laughing?


In [55]:
nns = index.get_nns_by_item(10280, 10)
for nn in nns:
    print(df2.query("uid == {:d}".format(nn)).iloc[0]["question"])

Why do long distance relationships fail?
Why do people in relationships cheat?
Why are relationships so complicated these days?
Why do some people hate romantic relationships?
Why do we always end up complicating the relationships we don't want to complicate?
Why do we lose the ability to trust people after a breakup?
Why are most guys in relationships whipped?
Why do men cheat in a relationship?
Why are girls afraid of relationships?
Why do you think people fall in love?


## 2

In [57]:
text = "Why life is so difficult?"

In [58]:
emb = model.encode(text)
emb

array([ 2.79351603e-02,  3.26070860e-02,  5.16851097e-02, -1.80328898e-02,
       -3.64138260e-02, -5.04031926e-02, -5.84826767e-02, -7.54945911e-03,
        2.62992270e-03, -9.89049599e-02,  5.94008528e-03, -2.83751525e-02,
        1.12738479e-02,  1.96757037e-02, -1.19554540e-02,  7.06120729e-02,
       -3.07264011e-02,  7.99148679e-02,  3.72252166e-02, -4.04153578e-02,
       -1.86942965e-02, -2.77251508e-02,  4.98593710e-02, -3.25897448e-02,
        4.33603972e-02,  1.94987040e-02,  5.87214716e-03, -1.83762684e-02,
        1.11114923e-02, -3.07710916e-02,  3.66237350e-02,  2.19741538e-02,
       -2.78152991e-02, -3.43110226e-02,  2.69636642e-02, -3.19100618e-02,
        4.01353650e-02,  3.65341604e-02,  2.96291113e-02, -3.78221758e-02,
       -7.51352981e-02, -3.62595799e-03,  1.19700644e-03,  3.84565890e-02,
       -1.92979835e-02,  4.34329733e-02,  1.54138179e-02,  3.00020445e-02,
        1.74787939e-02, -2.19933614e-02, -5.61783789e-03, -1.93434563e-02,
       -1.07677979e-03, -

In [60]:
nns = index.get_nns_by_vector(emb, 10)
for nn in nns:
    print(df2.query("uid == {:d}".format(nn)).iloc[0]["question"])

Why is life so unfair and difficult?
Why do you think life sucks?
Why is life not worth living?
Why is life boring sometimes?
Why is life so meaningless and boring?
Why do you live your life?
Why do we live? Why do we try?
Why is life beautiful?
Why do people live?
Why is life so unfair & unjust?
