In [None]:
# Mount Google Drive at /content/drive; the CSV files read and written below
# live under ./drive/MyDrive.
# NOTE(review): the relative ./drive paths used later presumably rely on the
# working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd

# Load the duplicate-question pairs and keep a 20% sample of the rows.
duplicate_dataset = pd.read_csv('./drive/MyDrive/DuplicateQuestions.csv')
# A fixed random_state makes the sample (and everything derived from it)
# reproducible across "Restart & Run All".
duplicate_dataset = duplicate_dataset.sample(frac=0.2, random_state=42)

# Original side of each pair (O* columns), as plain lists that the
# pre-processing steps below modify in place.
posts = duplicate_dataset.OBody.tolist()
titles = duplicate_dataset.OTitle.tolist()
tags = duplicate_dataset.OTags.tolist()

# Duplicate side of each pair (D* columns); index i lines up with the
# original lists above because both come from the same sampled rows.
duplicated_posts = duplicate_dataset.DBody.tolist()
duplicated_titles = duplicate_dataset.DTitle.tolist()
duplicated_tags = duplicate_dataset.DTags.tolist()

In [None]:
# Load the not-duplicated questions and keep a 20% sample of the rows.
not_duplicate_dataset = pd.read_csv('./drive/MyDrive/NotDuplicateQuestions.csv')
# A fixed random_state makes the sample reproducible across
# "Restart & Run All".
not_duplicate_dataset = not_duplicate_dataset.sample(frac=0.2, random_state=42)

# Plain lists that the pre-processing steps below modify in place.
# (The former empty-list initialisations were dead code: each name is
# assigned here unconditionally.)
not_duplicated_posts = not_duplicate_dataset.Body.tolist()
not_duplicated_titles = not_duplicate_dataset.Title.tolist()
not_duplicated_tags = not_duplicate_dataset.Tags.tolist()

# pre-processing

- removing HTML tags

In [None]:
from bs4 import BeautifulSoup

def remove_html_tags(inputs):
    """Strip HTML markup from every string of ``inputs``, in place.

    Each element is parsed with BeautifulSoup's "lxml" parser and replaced
    by the ``.text`` of the parsed document. Nothing is returned.
    """
    for position, markup in enumerate(inputs):
        inputs[position] = BeautifulSoup(markup, "lxml").text
        
# Strip markup from every text list, in this order: original questions and
# their duplicates (first dataset), then the not-duplicated questions
# (second dataset).
# NOTE(review): if the tag columns are stored as "<tag1><tag2>", HTML parsing
# would discard the tags themselves — confirm the CSV format.
for text_list in (
    posts, titles, tags,
    duplicated_posts, duplicated_titles, duplicated_tags,
    not_duplicated_posts, not_duplicated_titles, not_duplicated_tags,
):
    remove_html_tags(text_list)

- lower case

In [None]:
def lower_case(inputs):
    """Lower-case every string of ``inputs``, in place. Returns nothing."""
    for position, text in enumerate(inputs):
        inputs[position] = text.lower()
        
# Lower-case every text list, in this order: original questions and their
# duplicates (first dataset), then the not-duplicated questions (second
# dataset).
for text_list in (
    posts, titles, tags,
    duplicated_posts, duplicated_titles, duplicated_tags,
    not_duplicated_posts, not_duplicated_titles, not_duplicated_tags,
):
    lower_case(text_list)

- removing numbers

In [None]:
import re

def remove_numbers(inputs):
    """Delete every run of digits from each string of ``inputs``, in place.

    Returns nothing; the list passed in is modified.
    """
    for position, text in enumerate(inputs):
        inputs[position] = re.sub(r'\d+', '', text)

# Remove digits from every text list, in this order: original questions and
# their duplicates (first dataset), then the not-duplicated questions (second
# dataset).
for text_list in (
    posts, titles, tags,
    duplicated_posts, duplicated_titles, duplicated_tags,
    not_duplicated_posts, not_duplicated_titles, not_duplicated_tags,
):
    remove_numbers(text_list)

- removing punctuation

In [None]:
import re

def remove_punct(inputs):
    """Drop every character that is neither a word character nor whitespace.

    Operates in place on each string of ``inputs`` and returns nothing.
    Underscores survive because ``\\w`` matches them.
    """
    for position, text in enumerate(inputs):
        inputs[position] = re.sub(r'[^\w\s]', '', text)

# Remove punctuation from every text list, in this order: original questions
# and their duplicates (first dataset), then the not-duplicated questions
# (second dataset).
for text_list in (
    posts, titles, tags,
    duplicated_posts, duplicated_titles, duplicated_tags,
    not_duplicated_posts, not_duplicated_titles, not_duplicated_tags,
):
    remove_punct(text_list)

- Tokenization & removing stop words

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Resources needed by the tokenization step below.
nltk.download('stopwords')
nltk.download('punkt')
# NLTK 3.9+ loads the tokenizer model from 'punkt_tab' instead of 'punkt';
# fetching both keeps word_tokenize working on old and new NLTK releases.
nltk.download('punkt_tab')
# A set gives constant-time membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))

def remove_stop_words(inputs):
    """Tokenize each string and drop the tokens found in ``stop_words``.

    Returns a new list holding one token list per element of ``inputs``;
    ``inputs`` itself is left untouched.
    """
    return [
        [token for token in word_tokenize(text) if token not in stop_words]
        for text in inputs
    ]

# Tokenize and filter every text list; each group is processed left to right.

# Original questions (first dataset)
tokenized_posts, tokenized_titles, tokenized_tags = map(
    remove_stop_words, (posts, titles, tags)
)

# Duplicated questions (first dataset)
tokenized_duplicate_posts, tokenized_duplicate_titles, tokenized_duplicate_tags = map(
    remove_stop_words, (duplicated_posts, duplicated_titles, duplicated_tags)
)

# Not-duplicated questions (second dataset)
tokenized_n_d_posts, tokenized_n_d_titles, tokenized_n_d_tags = map(
    remove_stop_words,
    (not_duplicated_posts, not_duplicated_titles, not_duplicated_tags),
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


- lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download the WordNet corpus, then create the single lemmatizer instance
# that lemmatization() below uses as a module-level global.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatization(tokenized_input):
    """Lemmatize every token of every document, in place.

    Args:
        tokenized_input: list of token lists. Each inner list is replaced by
            a new list holding ``lemmatizer.lemmatize(token)`` for each token.

    Returns:
        None; ``tokenized_input`` is modified.
    """
    for position, tokens in enumerate(tokenized_input):
        # Each token is lemmatized exactly once. The earlier version called
        # lemmatize() twice per token and threw the first result away.
        tokenized_input[position] = [lemmatizer.lemmatize(word) for word in tokens]
    
# Lemmatize every tokenized list in place, in this order: original questions
# and their duplicates (first dataset), then the not-duplicated questions
# (second dataset).
for token_lists in (
    tokenized_posts, tokenized_titles, tokenized_tags,
    tokenized_duplicate_posts, tokenized_duplicate_titles, tokenized_duplicate_tags,
    tokenized_n_d_posts, tokenized_n_d_titles, tokenized_n_d_tags,
):
    lemmatization(token_lists)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


- Concatenation

In [None]:
def concat(tokenized_input):
    """Join each token list into one space-separated string.

    Returns a new list with one string per element of ``tokenized_input``.
    """
    return [' '.join(tokens) for tokens in tokenized_input]

# Rebuild space-separated strings from the cleaned tokens.
# Note: posts / titles / tags are rebound here to the cleaned text.
posts, titles, tags = map(
    concat, (tokenized_posts, tokenized_titles, tokenized_tags)
)
duplicatePosts, duplicateTitles, duplicateTags = map(
    concat,
    (tokenized_duplicate_posts, tokenized_duplicate_titles, tokenized_duplicate_tags),
)

# Not-duplicated questions (second dataset)
posts__, titles__, tags__ = map(
    concat, (tokenized_n_d_posts, tokenized_n_d_titles, tokenized_n_d_tags)
)

In [None]:
# Training corpus for the models below, laid out as:
# original posts, then duplicate posts, then not-duplicated posts.
allTokenizedPosts = [
    *tokenized_posts,
    *tokenized_duplicate_posts,
    *tokenized_n_d_posts,
]

# Doc2Vec

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# One TaggedDocument per post, tagged with its position in allTokenizedPosts.
documents = [
    TaggedDocument(tokens, [doc_id])
    for doc_id, tokens in enumerate(allTokenizedPosts)
]
# NOTE(review): no seed is passed and several workers are used, so the trained
# vectors presumably differ between runs — confirm whether that matters.
doc2vec_model = Doc2Vec(
    documents,
    vector_size=20,
    window=2,
    min_count=1,
    workers=4,
    epochs=100,
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from scipy import spatial

# For every (original, duplicate) pair, build five Doc2Vec cosine distances:
# post/post, title/title, tags/tags, post/title and title/post.
duplicated_posts_features = []
for i in range(len(posts)):
    # (left, right) texts compared for row i, in feature-column order.
    compared_texts = (
        (posts[i], duplicatePosts[i]),
        (titles[i], duplicateTitles[i]),
        (tags[i], duplicateTags[i]),
        (posts[i], duplicateTitles[i]),
        (titles[i], duplicatePosts[i]),
    )

    result = []
    for left_text, right_text in compared_texts:
        # Vectors are inferred afresh for each comparison, left side first.
        vector1 = doc2vec_model.infer_vector(left_text.split())
        vector2 = doc2vec_model.infer_vector(right_text.split())
        result.append(spatial.distance.cosine(vector1, vector2))

    duplicated_posts_features.append(result)

In [None]:
# Consecutive rows (i, i + 1) of the not-duplicated data form one negative
# pair. For each pair, build the same five Doc2Vec cosine distances as for
# the duplicate pairs: post/post, title/title, tags/tags, post/title and
# title/post.
not_duplicated_posts_features = []
# Stopping at len - 1 skips a trailing unpaired row; with an odd number of
# sampled rows the former range(0, len, 2) raised IndexError on [i + 1].
for i in range(0, len(posts__) - 1, 2):
    partner = i + 1
    # (left, right) texts compared for this pair, in feature-column order.
    compared_texts = (
        (posts__[i], posts__[partner]),
        (titles__[i], titles__[partner]),
        (tags__[i], tags__[partner]),
        (posts__[i], titles__[partner]),
        (titles__[i], posts__[partner]),
    )

    result = []
    for left_text, right_text in compared_texts:
        # Vectors are inferred afresh for each comparison, left side first.
        vector1 = doc2vec_model.infer_vector(left_text.split())
        vector2 = doc2vec_model.infer_vector(right_text.split())
        result.append(spatial.distance.cosine(vector1, vector2))

    not_duplicated_posts_features.append(result)

# LDA

In [None]:
from gensim.corpora import Dictionary
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamulticore import LdaMulticore

# Vocabulary built from every preprocessed post.
dictionary = Dictionary(allTokenizedPosts)

# Bag-of-words representation of each post, in corpus order.
corpus = [dictionary.doc2bow(tokens) for tokens in allTokenizedPosts]

# NOTE(review): no random_state is passed, so the learned topics presumably
# vary between runs — confirm whether that matters.
lda_model = LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
)

In [None]:
from gensim.matutils import cossim

# Bag-of-words for the original posts and for their duplicates.
corpus_p = [dictionary.doc2bow(tokens) for tokens in tokenized_posts]
corpus_dup_p = [dictionary.doc2bow(tokens) for tokens in tokenized_duplicate_posts]

# Append the LDA topic-distribution cosine similarity of each
# (original, duplicate) pair to that pair's feature row.
for counter, original_bow in enumerate(corpus_p):
    vector1 = lda_model.get_document_topics(original_bow, minimum_probability=0)
    vector2 = lda_model.get_document_topics(corpus_dup_p[counter], minimum_probability=0)
    duplicated_posts_features[counter].append(cossim(vector1, vector2))

In [None]:
from gensim.matutils import cossim

# Bag-of-words for the not-duplicated posts.
corpusP__ = [dictionary.doc2bow(post) for post in tokenized_n_d_posts]

# Append the LDA topic-distribution cosine similarity of each consecutive
# (i, i + 1) pair to that pair's feature row.
# Stopping at len - 1 skips a trailing unpaired row; with an odd number of
# rows the former range(0, len, 2) raised IndexError on corpusP__[i + 1].
for counter, i in enumerate(range(0, len(posts__) - 1, 2)):
    vector1 = lda_model.get_document_topics(corpusP__[i], minimum_probability=0)
    vector2 = lda_model.get_document_topics(corpusP__[i + 1], minimum_probability=0)
    coss_similarity = cossim(vector1, vector2)
    not_duplicated_posts_features[counter].append(coss_similarity)

# Relevance similarity

In [None]:
from rank_bm25 import BM25Okapi

# BM25 index over every preprocessed post; document positions follow the
# order of allTokenizedPosts.
bm25 = BM25Okapi(allTokenizedPosts)

In [None]:
# BM25 relevance of each original question (post, title, tags used as the
# query) against the post of its paired duplicate.
#
# allTokenizedPosts is the concatenation originals + duplicates +
# not-duplicates, so the duplicate paired with original i sits at index
# len(tokenized_posts) + i. The former index 2*i+1 assumed an interleaved
# corpus and therefore read the score of an unrelated document.
duplicate_offset = len(tokenized_posts)
for i in range(len(posts)):
    paired_index = duplicate_offset + i
    for query_tokens in (tokenized_posts[i], tokenized_titles[i], tokenized_tags[i]):
        scores = bm25.get_scores(query_tokens)
        duplicated_posts_features[i].append(scores[paired_index])

In [None]:
# BM25 relevance of the first question of each not-duplicate pair (post,
# title, tags used as the query) against the post of its partner, row i + 1.
#
# allTokenizedPosts is the concatenation originals + duplicates +
# not-duplicates, so not-duplicated post k sits at index
# len(tokenized_posts) + len(tokenized_duplicate_posts) + k. The former index
# 2*i+1 pointed at an unrelated document and could run past the corpus end.
not_duplicate_offset = len(tokenized_posts) + len(tokenized_duplicate_posts)
# Stopping at len - 1 skips a trailing row that has no partner.
for counter, i in enumerate(range(0, len(posts__) - 1, 2)):
    partner_index = not_duplicate_offset + i + 1
    for query_tokens in (
        tokenized_n_d_posts[i],
        tokenized_n_d_titles[i],
        tokenized_n_d_tags[i],
    ):
        scores = bm25.get_scores(query_tokens)
        not_duplicated_posts_features[counter].append(scores[partner_index])

# Add labels

In [None]:
# Label every duplicate-pair feature row with class 1.
# Re-running this cell appends the label again.
for feature_row in duplicated_posts_features:
    feature_row.append(1)

In [None]:
# Label every not-duplicate feature row with class 0.
# Re-running this cell appends the label again.
for feature_row in not_duplicated_posts_features:
    feature_row.append(0)

# Write Output with label

In [None]:
import csv
filename = "./drive/MyDrive/duplicated_scores.csv"

# Columns: 1-5 Doc2Vec cosine distances, 6 LDA cosine similarity,
# 7-9 BM25 scores, 10 label.
# newline='' lets the csv module control line endings; without it every row
# is followed by a blank line on Windows.
with open(filename, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)

    csvwriter.writerows([[1,2,3,4,5,6,7,8,9,10]] + duplicated_posts_features)

In [None]:
filename = "./drive/MyDrive/not_duplicated_scores.csv"

# Columns: 1-5 Doc2Vec cosine distances, 6 LDA cosine similarity,
# 7-9 BM25 scores, 10 label.
# newline='' lets the csv module control line endings; without it every row
# is followed by a blank line on Windows.
with open(filename, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)

    csvwriter.writerows([[1,2,3,4,5,6,7,8,9,10]] + not_duplicated_posts_features)