## Libraries

In [1]:
# libraries
import nltk #NLP lib
import numpy as np # numpy for arrays
import pandas as pd # pandas lib for data handling
from nltk.corpus import stopwords # common eng sstopwords
from nltk.stem import PorterStemmer #porter stemmer algo
from nltk.tokenize import sent_tokenize, word_tokenize
from bs4 import BeautifulSoup # HTML to txt

ps = PorterStemmer()
sw_nltk = stopwords.words('english') # bag of all common english stop words

## Supporting Functions

In [2]:
def TokStem(text,j): # Tokenise and Stem the given text
    words = [word for word in text.split() if word.lower() not in sw_nltk] # bag of tokenised words
    final = []
    for w in words:
        final.append(ps.stem(w))
    if j==1: # give a string
        return " ".join(final)
    return final
text = "When I first met her she was very quiet. She remained quiet during the entire two hour long journey from Stony Brook to New York."
x = TokStem(text,0)

In [3]:
def hbt(text): # convert HTML body to text
    return BeautifulSoup(text).get_text()

In [4]:
def rmTags(text): # convert tags to text
    st,out = [],[]
    for i in text:
        if i == '<':
            st = []
        elif i == '>':
            st += ' '
            out += st
        else:
            st += i
    return "".join(out[:-1]) # remove the last space

In [46]:
############ test code
x = "<xz><y>"
x += "<w>"
print(x)
y = rmTag(x)
print(y,len(y))
print(x[:-1])

<xz><y><w>
xz y w 6
<xz><y><w


In [5]:
def cos_sim(a,b): # finding cosine similarity score for given 2 strings
    # input two strings that are split into words
    us = [] # creating union set
    us += b
    for x in a:
        if x not in b:
            us.append(x)
    freqa,freqb,frequ = dict(),dict(),dict()
    wa,wb = [], []

    for word in a: # freq bag for string a
        if word in freqa:
            freqa[word] += 1
        else:
            freqa[word] = 1
    for word in b:# freq bag for string b
        if word in freqb:
            freqb[word] += 1
        else:
            freqb[word] = 1
    for word in us: # freq bag for union of a and b
        if word in frequ:
            frequ[word] += 1
        else:
            frequ[word] = 1
    
    for i in range(len(us)): # calc TitleVec for a and b
        x = us[i]
        if x in a:
            wa.append(freqa[x]/frequ[x])
        else:
            wa.append(0)
        if x in b:
            wb.append(freqb[x]/frequ[x])
        else:
            wb.append(0)
    wa = np.array(wa)
    wb = np.array(wb)
    return np.dot(wa,wb)/(np.linalg.norm(wa)*np.linalg.norm(wb)) # cosine value

## Loading the data 

In [6]:
df = pd.read_csv('QueryResults.csv')
print(type(df))
print(df.columns)
# Q = df[['OId','OTitle','OBody','OTags']]
# print(Q.size)
# R = df[[ 'DId', 'DTitle', 'DBody', 'DTags']]
# P = Q.append(R,ignore_index=True)
# print(P.size)
# print(Q)
# df = df.dropna(axis = 0)
#print(df.describe())
# df.drop(['CreationDate'], axis = 1)
print(df.columns)
# print(df.head())
X_features = ['Title','Body','Tags','Topic']
# a = df.loc[10].at["OBody"]
#print(a)
#print(BeautifulSoup(a).get_text()) # html to text 

<class 'pandas.core.frame.DataFrame'>
Index(['OId', 'CreationDate', 'OTitle', 'OBody', 'OTags', 'DId', 'DTitle',
       'DBody', 'DTags'],
      dtype='object')
Index(['OId', 'CreationDate', 'OTitle', 'OBody', 'OTags', 'DId', 'DTitle',
       'DBody', 'DTags'],
      dtype='object')


## Data Preprocessing

In [13]:
k = 500
# duplicate questions for training (with sorting)
Q2 = df[['OId','OTitle','OBody','OTags']].loc[0:k-1].copy() # original qns
Q1 = df[['DId','DTitle','DBody','DTags']].loc[0:k-1].copy() # doop qns
print(Q1.shape)
te = []
for i in range(k): # pickout original qns
    x = int(Q2[['OId']].loc[i])
    if x in te:
        Q2.drop([i], inplace = True) # remove rows
    else:
        te.append(x)
# print(Q2.shape)
Q2 = Q2.reset_index(drop=True) # restore the indices
# print(Q2.loc[10])

# # process the body wit tok, stem and modify tags and export
for i in range(Q2.shape[0]):
    x = TokStem(hbt(Q2.loc[i,'OBody']),1)
    y = TokStem(rmTags(Q2.loc[i,'OTags']),1)
    Q2.loc[i,'OBody'] = x
    Q2.loc[i,'OTags'] = y
for i in range(Q1.shape[0]):
    x = TokStem(hbt(Q1.loc[i,'DBody']),1)
    y = TokStem(rmTags(Q1.loc[i,'DTags']),1)
    Q1.loc[i,'DBody'] = x
    Q1.loc[i,'DTags'] = y


# Q2.to_csv('body_stem.csv') # export to CSV

(500, 4)


## Calculating Similarity component scores

In [35]:
b= ['OTitle','OBody','OTags']
a = ['DTitle','DBody','DTags']
SC = np.zeros([Q2.shape[0],Q1.shape[0],4]) # store all the component scores [doop,ori,4]
y = 162130
for i in range(Q2.shape[0]):
    for j in range(Q1.shape[0]):
        SC[i][j][0] = cos_sim(Q1.loc[j,a[0]].split(),Q2.loc[i,b[0]].split()) # TitleSC
        SC[i][j][1] = cos_sim(Q1.loc[j,a[1]].split(),Q2.loc[i,b[1]].split()) # BodySC
        SC[i][j][2] = cos_sim(Q1.loc[j,a[2]].split(),Q2.loc[i,b[2]].split()) # TagsSC


# Estimating parameters

In [51]:
sp = np.linspace(0,1,2)
gco = 0 # global count
ga,gb,gc = 0,0,0 # final parameter values
for a in sp:
    for b in sp:
        for c in sp:
            if a==0 and b==0 and c==0:
                continue
            tc = 0 # temp count
            for i in range(Q1.shape[0]): # iterate all duplicate qns
                te = []
                for j in range(Q2.shape[0]): # iterate over available qns
                    te.append(a*SC[j][i][0]+b*SC[j][i][1]+c*SC[j][i][2])
                x = te.index(max(te))
                if Q2.loc[x,'OId'] == df.loc[x,'OId']: # if correct original qn is detected
                    tc += 1
            if tc >= gco: # optimal value found
                ga,gb,gc = a,b,c
                gco = tc
                print(ga,gb,gc,tc)
            print(tc)
            
# print(SC[0][0:2])
# x = Q1.loc[0,a[1]]
# y = Q2.loc[0,b[0]]
# print(x,y)
# print(cos_sim(x,y))
# print(Q1.loc[0],Q1.loc[1])
# print(Q2.loc[0])

0.0 0.0 1.0 33
33
16
11
16
17
15
12


In [52]:
print(ga,gb,gc)

0.0 0.0 1.0
