## Importing all the mobile datasets

In [None]:
import pandas as pd
import requests
import json
import urllib.request
import time
from bs4 import BeautifulSoup

In [None]:
data = pd.read_csv("Mobile_data.csv")
data.tail()

In [None]:
result = data.copy()
result

In [None]:
bug_ids = list(result['Bug ID'])
bug_ids[:10]

In [None]:
len(bug_ids)

In [None]:
from datetime import datetime

In [None]:
# Fetch the first comment (count == 0) of one sample bug through the REST API;
# the rest of this notebook uses that comment's text as the bug description.
resp = requests.get('https://bugzilla.mozilla.org/rest/bug/1631754/comment', timeout=30)
resp.raise_for_status()  # surface an HTTP error here instead of a confusing KeyError below
sample_comments = resp.json()['bugs']['1631754']['comments']
next(comment['text'] for comment in sample_comments if comment['count'] == 0)

In [None]:
# Fetch, for every bug id, its description (the text of comment 0) and the id
# of the bug it duplicates ('dupe_of') through the Bugzilla REST API.
resp = None
failed_count = 0
duplicates = []
comments = {}
tick = datetime.now()
total = len(bug_ids)
bug_not_fetched = []
index_not_fetched = []

for idx, val in enumerate(bug_ids):
    bug_url = 'https://bugzilla.mozilla.org/rest/bug/' + str(val)
    try:
        # Both requests must succeed before anything is stored, so that
        # `comments` and `duplicates` always grow together and stay aligned.
        resp = requests.get(bug_url + '/comment', timeout=30)
        desc = next(
            comment['text']
            for comment in resp.json()['bugs'][str(val)]['comments']
            if comment['count'] == 0
        )
        with urllib.request.urlopen(bug_url, timeout=30) as url:
            bug_data = json.loads(url.read().decode())
        dupe_of = bug_data['bugs'][0]['dupe_of']
    except Exception:
        # Best-effort download: remember what could not be fetched and go on.
        failed_count += 1
        bug_not_fetched.append(val)
        index_not_fetched.append(idx)
        continue

    comments[val] = desc
    duplicates.append(dupe_of)

    tock = datetime.now()
    delta = tock - tick
    done = idx + 1  # 1-based so the very first iteration does not divide by zero
    # Remaining time is extrapolated from the average time per processed id.
    print(f'\r{idx}, Elapsed: {delta}, Remaining: {delta / done * (total - done)}', end='')

In [None]:
failed_count

In [None]:
len(duplicates)

In [None]:
bug_desc = pd.DataFrame(comments.items(), columns=['Bug Id', 'Description'])
bug_desc['Duplicate_Bug_Ids'] = duplicates
bug_desc

In [None]:
final_data = pd.merge(result,bug_desc,left_on='Bug ID',right_on= "Bug Id")
final_data

In [None]:
final_data = final_data.drop(columns=['Assignee', 'Duplicates', 'Number of Duplicates'])
final_data

In [None]:
final_data.to_csv('mobile_dataset.csv') #dataset with description

In [None]:
final_data1= final_data.copy()
final_data1

In [None]:
duplicate_id = list(final_data.Duplicate_Bug_Ids) #storing duplicate ids in duplicate_id
duplicate_id[:50]

In [None]:
dup_list = [x for x in duplicate_id if pd.isnull(x) == False and x != 'nan']
dup_list[:10]

In [None]:
len(dup_list)

In [None]:
final_data2= final_data.copy()
final_data2

In [None]:
# Count how many duplicate ids are also present as bug ids in the whole
# dataset (last five years).
# A set gives constant-time membership tests instead of scanning the list
# once per duplicate id.
bug_id_lookup = set(bug_ids)
dup_id_in_bug_id = [v for v in dup_list if v in bug_id_lookup]
count_id = len(dup_id_in_bug_id)

In [None]:
len(dup_id_in_bug_id) #length of duplicate id which is present in bug ids

In [None]:
# Duplicate ids that are NOT present among the bug ids; the set keeps the
# filter linear instead of rescanning the list for every element.
found_ids = set(dup_id_in_bug_id)
dup_id_not_in_bug_id = [x for x in dup_list if x not in found_ids]
len(dup_id_not_in_bug_id)

In [None]:
df = pd.read_csv("mobile_dataset.csv") #copying the dataset with description into another dataframe
df.head()

In [None]:
df = df.drop(columns = ["Unnamed: 0"])
df

In [None]:
df.Description

In [None]:
df.info()

### Preprocessing the entire dataset

In [None]:
import re
# as per recommendation from @freylis, compile once only
# Matches HTML tags and character entities (named, decimal or hexadecimal).
CLEANR = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')


def cleanhtml(raw_html):
    """Return ``raw_html`` with every match of ``CLEANR`` removed."""
    return CLEANR.sub('', raw_html)
# Strip HTML from every description and keep at most its first 500 characters.
# Rows yielded by DataFrame.iterrows() are copies, so assigning to such a row
# never reaches `df`; the cleaned text is written back to the column instead.
def _strip_and_truncate(des, limit=500):
    """Flatten newlines, remove HTML via cleanhtml and cut to ``limit`` characters."""
    return cleanhtml(des.replace("\n", " "))[:limit]


# Missing descriptions are left untouched here (they have no text to clean);
# rows without a description are dropped further down in this cell.
has_description = df['Description'].notna()
df.loc[has_description, 'Description'] = (
    df.loc[has_description, 'Description'].astype(str).map(_strip_and_truncate)
)
        
# Text cleaning round 1 (removing punctuation)
import re
import string

def clean_text_round1(text):
    '''Lowercase text and strip digit-bearing words, bracketed text, URLs and punctuation.

    Every removed span is replaced by a single space, so the result can
    contain runs of spaces.

    Args:
        text: the description string to clean.

    Returns:
        The cleaned, lower-cased string.
    '''
    text = re.sub(r'\w*\d\w*', ' ', text)  # words containing a digit
    text = re.sub(r'\w*\f\w*', ' ', text)  # words containing a form feed
    text = re.sub(r'\(.*?\)', ' ', text)   # text in parentheses
    # Non-greedy, and without a trailing ')' requirement, so that each
    # "[...]" group is removed on its own.
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub(r'https?://[A-Za-z0-9./]+', ' ', text)
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    return text

round1 = lambda x: clean_text_round1(x)

df = df.dropna(axis=0, subset=['Description'])
df['Description'] = df['Description'].apply(clean_text_round1)

# Apply a second round of cleaning (removing punctuations)
def clean_text_round2(text):
    '''Replace curly quotes, ellipsis characters, newlines and tabs with spaces.'''
    # Same three substitutions as before, applied in the same order.
    for pattern in ('[‘’“”…]', '\n', '\t'):
        text = re.sub(pattern, ' ', text)
    return text

round2 = lambda x: clean_text_round2(x)

df['Description'] = df['Description'].apply(clean_text_round2)

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def _lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text`` and re-join with single spaces."""
    return ' '.join(lemmatizer.lemmatize(token) for token in text.split())


# WordNetLemmatizer.lemmatize() works on a single word, and rows yielded by
# DataFrame.iterrows() are copies, so the column is rewritten token by token
# instead of assigning to the iterated row.
df['Description'] = df['Description'].apply(_lemmatize_text)

## Saving the entire preprocessed dataset

In [None]:
df.to_csv("mobile_preproccessed_whole_dataset.csv")

### Only the duplicate dataset

In [None]:
duplicate_dataset_mobile = df[df['Duplicate_Bug_Ids'] > 0]
duplicate_dataset_mobile

In [None]:
duplicate_id = list(duplicate_dataset_mobile.Duplicate_Bug_Ids)
duplicate_id[:10]

In [None]:
dup_list = [int(a) for a in duplicate_id]
dup_list[:10]

In [None]:
bug_ids = list(df["Bug ID"])
bug_ids[:10]

In [None]:
# Duplicate ids that are also present as bug ids in the preprocessed dataset.
# A set gives constant-time membership tests instead of scanning the list
# once per duplicate id.
bug_id_lookup = set(bug_ids)
dup_id_in_bug_id = [v for v in dup_list if v in bug_id_lookup]
count_id = len(dup_id_in_bug_id)

In [None]:
len(dup_id_in_bug_id)

In [None]:
# Duplicate ids missing from the dataset; the set keeps the filter linear.
found_ids = set(dup_id_in_bug_id)
dup_id_not_in_bug_id = [x for x in dup_list if x not in found_ids]

In [None]:
len(dup_id_not_in_bug_id)

In [None]:
dup_id_not_in_bug_id = set(dup_id_not_in_bug_id)

### Fetching the description for the duplicate ids that do not have the description

In [None]:
# Fetch the description (text of comment 0) and 'dupe_of' for every duplicate
# id that is not already part of the dataset.
resp = None
failed_count_dup = 0
duplicates_dup = []
comments_dup = {}
tick = datetime.now()
total = len(dup_id_not_in_bug_id)
for idx, val in enumerate(dup_id_not_in_bug_id):
    bug_url = 'https://bugzilla.mozilla.org/rest/bug/' + str(val)
    try:
        # Both requests must succeed before anything is stored, so that
        # `comments_dup` and `duplicates_dup` stay positionally aligned.
        resp = requests.get(bug_url + '/comment', timeout=30)
        desc_dup = next(
            comment['text']
            for comment in resp.json()['bugs'][str(val)]['comments']
            if comment['count'] == 0
        )
        with urllib.request.urlopen(bug_url, timeout=30) as url:
            bug_data = json.loads(url.read().decode())
        dupe_of = bug_data['bugs'][0]['dupe_of']
    except Exception:
        # Best-effort download: count the failure and move on to the next id.
        failed_count_dup += 1
        continue

    comments_dup[val] = desc_dup
    duplicates_dup.append(dupe_of)

    tock = datetime.now()
    delta = tock - tick
    done = idx + 1  # 1-based so the very first iteration does not divide by zero
    # Remaining time is extrapolated from the average time per processed id.
    print(f'\r{idx}, Elapsed: {delta}, Remaining: {delta / done * (total - done)}', end='')

In [None]:
failed_count_dup

In [None]:
dup_desc = pd.DataFrame(comments_dup.items(), columns=['Dup Id', 'Description'])
dup_desc['Dup_Bug_Ids'] = duplicates_dup
dup_desc

In [None]:
resp = None
failed_count_dup = 0
duplicates_dup = []

comments_dup = {}
summary_dup = {}
creation_dup = {}
resolution_dup = {}
status_dup = {}
product_dup = {}
component_dup = {}

tick = datetime.now()
total = len(dup_id_not_in_bug_id)

### Fetching the other features of the duplicate dataset

In [None]:
# Fetch summary, creation time, resolution, status, product, component and
# 'dupe_of' for every duplicate id that is missing from the dataset.
for idx, val in enumerate(dup_id_not_in_bug_id):

    bug_url = 'https://bugzilla.mozilla.org/rest/bug/' + str(val)

    try:
        resp = requests.get(bug_url, timeout=30)
        # Parse the payload once; every field below comes from this record,
        # including 'dupe_of', so no second request to the same URL is needed.
        bug = resp.json()['bugs'][0]
        summ_dup = bug['summary']
        create_dup = bug['creation_time']
        resolve_dup = bug['resolution']
        stat_dup = bug['status']
        prod_dup = bug['product']
        comp_dup = bug['component']
        dupe_of = bug['dupe_of']
    except Exception:
        # Best-effort download: nothing is stored for an id that failed.
        failed_count_dup += 1
        continue

    summary_dup[val] = summ_dup
    creation_dup[val] = create_dup
    resolution_dup[val] = resolve_dup
    status_dup[val] = stat_dup
    product_dup[val] = prod_dup
    component_dup[val] = comp_dup
    duplicates_dup.append(dupe_of)

    tock = datetime.now()
    delta = tock - tick
    done = idx + 1  # 1-based so the very first iteration does not divide by zero
    # Remaining time is extrapolated from the average time per processed id.
    print(f'\r{idx}, Elapsed: {delta}, Remaining: {delta / done * (total - done)}', end='')

In [None]:
# Fetch the description (text of comment 0) for every duplicate id that is
# missing from the dataset.
for idx, val in enumerate(dup_id_not_in_bug_id):

    bug_url = 'https://bugzilla.mozilla.org/rest/bug/'+str(val)+'/comment'

    try:
        resp = requests.get(bug_url, timeout=30)
        desc_dup = next(
            comment['text']
            for comment in resp.json()['bugs'][str(val)]['comments']
            if comment['count'] == 0
        )
    except Exception:
        # Best-effort download: count the failure and move on to the next id.
        failed_count_dup += 1
        continue

    comments_dup[val] = desc_dup

    tock = datetime.now()
    delta = tock - tick
    done = idx + 1  # 1-based so the very first iteration does not divide by zero
    # Remaining time is extrapolated from the average time per processed id.
    print(f'\r{idx}, Elapsed: {delta}, Remaining: {delta / done * (total - done)}', end='')

In [None]:
# Build the description table from the comments collected per duplicate id.
dup_desc = pd.DataFrame(comments_dup.items(), columns=['Dup Id', 'Description'])
# duplicates_dup is attached by position, with its last element dropped.
# NOTE(review): the [:-1] presumably compensates for duplicates_dup being one
# entry longer than comments_dup; it also assumes both were filled in the same
# id order — confirm, positional alignment silently mislabels rows otherwise.
dup_desc['Dup_Bug_Ids'] = duplicates_dup[:-1]
dup_desc

In [None]:
dup_summary = pd.DataFrame(summary_dup.items(), columns=['Dup Id', 'Summary'])
# dup_summary['Dup_Bug_Ids'] = duplicates_dup
dup_summary

In [None]:
dup_createdtime = pd.DataFrame(creation_dup.items(), columns=['Dup Id', 'Created_time'])
# dup_createdtime['Dup_Bug_Ids'] = duplicates_dup
dup_createdtime

In [None]:
dup_resolution = pd.DataFrame(resolution_dup.items(), columns=['Dup Id', 'Resolution'])
# dup_resolution['Dup_Bug_Ids'] = duplicates_dup
dup_resolution

In [None]:
dup_status = pd.DataFrame(status_dup.items(), columns=['Dup Id', 'Status'])
# dup_status['Dup_Bug_Ids'] = duplicates_dup
dup_status

In [None]:
dup_product = pd.DataFrame(product_dup.items(), columns=['Dup Id', 'Product'])
# dup_product['Dup_Bug_Ids'] = duplicates_dup
dup_product

In [None]:
dup_component = pd.DataFrame(component_dup.items(), columns=['Dup Id', 'Component'])
# dup_component['Dup_Bug_Ids'] = duplicates_dup
dup_component

In [None]:
data_merged = pd.merge(dup_desc,dup_summary,left_on="Dup Id",right_on= "Dup Id")
data_merged

In [None]:
data_merged = pd.merge(data_merged,dup_createdtime,left_on="Dup Id",right_on= "Dup Id")
data_merged

In [None]:
data_merged = pd.merge(data_merged,dup_resolution,left_on="Dup Id",right_on= "Dup Id")
data_merged

In [None]:
data_merged = pd.merge(data_merged,dup_status,left_on="Dup Id",right_on= "Dup Id")
data_merged

In [None]:
data_merged = pd.merge(data_merged,dup_product,left_on="Dup Id",right_on= "Dup Id")
data_merged

In [None]:
data_merged = pd.merge(data_merged,dup_component,left_on="Dup Id",right_on= "Dup Id")
data_merged

### The final duplicate dataset with all the fields

In [None]:
old_duplicate_data_mobile= data_merged[['Dup Id','Product','Component','Status','Resolution','Summary','Created_time','Description','Dup_Bug_Ids']]
old_duplicate_data_mobile

In [None]:
not_nan_list = [x for x in old_duplicate_data_mobile.Dup_Bug_Ids if pd.isnull(x) == False and x != 'nan']
len(not_nan_list)

### Keeping only the data that have a duplicate i.e the duplicate id is not null

In [None]:
# Removes, in place, every row whose Dup_Bug_Ids value is >= 0.
# NOTE(review): this discards the rows that HAVE a duplicate id, which looks
# like the opposite of what the heading of this section describes — confirm
# the intended direction of the filter.
old_duplicate_data_mobile.drop(old_duplicate_data_mobile.index[old_duplicate_data_mobile['Dup_Bug_Ids'] >= 0], inplace=True)
old_duplicate_data_mobile                                                                                           

In [None]:
not_nan_list1 = [x for x in old_duplicate_data_mobile.Dup_Bug_Ids if pd.isnull(x) == False and x != 'nan']
len(not_nan_list1)

In [None]:
old_duplicate_data_mobile =  old_duplicate_data_mobile.rename(columns={"Dup Id": "Bug_Id", "Chnaged": "Changed","Dup_Bug_Ids":"Duplicate_Bug_Ids"})
old_duplicate_data_mobile

In [None]:
duplicate_dataset_mobile =  duplicate_dataset_mobile.rename(columns={"Bug ID": "Bug_Id"})
duplicate_dataset_mobile

In [None]:
duplicate_dataset_mobile.reset_index(drop=True, inplace=True)
duplicate_dataset_mobile

### Preprocessing the duplicate dataset

In [None]:
import re
# as per recommendation from @freylis, compile once only
# Matches HTML tags and character entities (named, decimal or hexadecimal).
CLEANR = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')


def cleanhtml(raw_html):
    """Return ``raw_html`` with every match of ``CLEANR`` removed."""
    stripped = CLEANR.sub('', raw_html)
    return stripped

# Strip HTML from every description and keep at most its first 500 characters.
# Rows yielded by DataFrame.iterrows() are copies, so assigning to such a row
# never reaches the DataFrame; the cleaned text is written back to the column.
def _strip_and_truncate(des, limit=500):
    """Flatten newlines, remove HTML via cleanhtml and cut to ``limit`` characters."""
    return cleanhtml(des.replace("\n", " "))[:limit]


# Missing descriptions are left untouched here (they have no text to clean);
# rows without a description are dropped further down in this cell.
has_description = old_duplicate_data_mobile['Description'].notna()
old_duplicate_data_mobile.loc[has_description, 'Description'] = (
    old_duplicate_data_mobile.loc[has_description, 'Description']
    .astype(str)
    .map(_strip_and_truncate)
)
        
# Text cleaning round 1 (removing punctuation)
import re
import string

def clean_text_round1(text):
    '''Lowercase text and strip digit-bearing words, bracketed text, URLs and punctuation.

    Every removed span is replaced by a single space, so the result can
    contain runs of spaces.

    Args:
        text: the description string to clean.

    Returns:
        The cleaned, lower-cased string.
    '''
    text = re.sub(r'\w*\d\w*', ' ', text)  # words containing a digit
    text = re.sub(r'\w*\f\w*', ' ', text)  # words containing a form feed
    text = re.sub(r'\(.*?\)', ' ', text)   # text in parentheses
    # Non-greedy, and without a trailing ')' requirement, so that each
    # "[...]" group is removed on its own.
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub(r'https?://[A-Za-z0-9./]+', ' ', text)
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    return text

round1 = lambda x: clean_text_round1(x)

old_duplicate_data_mobile = old_duplicate_data_mobile.dropna(axis=0, subset=['Description'])
old_duplicate_data_mobile['Description'] = old_duplicate_data_mobile['Description'].apply(clean_text_round1)

# Apply a second round of cleaning (removing punctuations)
def clean_text_round2(text):
    '''Replace curly quotes, ellipsis characters, newlines and tabs with spaces.'''
    # Same three substitutions as before, applied in the same order.
    cleaned = text
    for pattern in ('[‘’“”…]', '\n', '\t'):
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned

round2 = lambda x: clean_text_round2(x)

old_duplicate_data_mobile['Description'] = old_duplicate_data_mobile['Description'].apply(clean_text_round2)

In [None]:
old_duplicate_data_mobile

In [None]:
old_duplicate_data_mobile = old_duplicate_data_mobile.rename(columns={"Created_time": "Changed"})

### The final preprocessed duplicate dataset

In [None]:
final_duplicate_data_merged = pd.concat([old_duplicate_data_mobile, duplicate_dataset_mobile], ignore_index=True)
final_duplicate_data_merged

In [None]:
final_duplicate_data_merged.to_csv("mobile_duplicate_total.csv")

### Reading the whole preprocessed dataset

In [None]:
fd_copy2 = df.copy()
fd_copy2

## Splitting the dataset into training and testing set

In [None]:
import random
import sklearn
from sklearn.model_selection import train_test_split

In [None]:
training_dataset ,test_dataset= sklearn.model_selection.train_test_split(fd_copy2, train_size=0.8, test_size=0.2)

In [None]:
nan_value = float("NaN")
test_dataset.replace("", nan_value, inplace=True)
test_dataset.dropna(subset = ["Description"], inplace=True)

In [None]:
cond = (test_dataset['Description'] == 'NaN')
cond.unique()

In [None]:
test_dataset.to_csv("mobile_testing_dataset.csv",index = None)

In [None]:
training_dataset

In [None]:
training_dataset.to_csv("mobile_training_dataset.csv",index = None)

#### Merging the newly fetched duplicate data with the whole dataset

In [None]:
total_d = pd.concat([old_duplicate_data_mobile, df], ignore_index=True)
total_d

## Running TF-IDF on the duplicate dataset for splitting into textually similar and dissimilar 

In [None]:
fd_dup= final_duplicate_data_merged.copy()
fd_dup

In [None]:
ted = fd_dup['Description']
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TfidfVectorizer object
vectorizer = TfidfVectorizer()

# Generate matrix of word vectors
tfidf_matrix = vectorizer.fit_transform(ted)

# Print the shape of tfidf_matrix
print(tfidf_matrix.shape)

### Computing the cosine similarity of the duplicate dataset

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Initialize an instance of tf-idf Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Generate the tf-idf vectors for the corpus
tfidf_matrix = tfidf_vectorizer.fit_transform(ted)

# compute and print the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

In [None]:
print(len(cosine_sim))
print(cosine_sim.shape)

In [None]:
import numpy as np

In [None]:
avg =np.average(cosine_sim, axis=1)
cosine_sim[1]

In [None]:
avg_df = pd.DataFrame(avg, columns = ['Avg_Sim_Score'])

print(avg_df)
print(type(avg_df))

In [None]:
len(avg_df)

In [None]:
avg_for_sorting = pd.DataFrame(avg_df)

In [None]:
# Rows ordered from highest to lowest average similarity score.
# Bound to `sorted_avg_scores` rather than `sorted` so the built-in sorted()
# stays usable in the cells that follow.
sorted_avg_scores = avg_for_sorting.sort_values(by='Avg_Sim_Score', ascending=False)

In [None]:
from scipy.stats import skew
#testing skewness of distribution
print(skew(avg))
print(skew(avg, bias=False))

In [None]:
# Imported in this cell because it runs before the cell that imports
# `statistics`; without it a top-to-bottom run fails with a NameError.
import statistics

# Median of the per-row average similarity scores.
print(statistics.median(avg))

In [None]:
import statistics
# Calculate mean values
print(statistics.mean(avg))

In [None]:
max_value = avg.max()
print(max_value)
min_value = avg.min()
print(min_value)

In [None]:
avg_df.values

In [None]:
fd_dup1= final_duplicate_data_merged.copy()
fd_dup1

In [None]:
fd_dup1.insert(loc=7,
          column='Average_Cos_Similarity_Score',
          value=avg_df.values)

In [None]:
fd_dup1

### Textually similar data

In [None]:
# Rows whose Average_Cos_Similarity_Score is strictly below the hard-coded cut-off.
# NOTE(review): a LOWER similarity score is labelled "similar" here — confirm
# the comparison is not inverted.
# NOTE(review): the cut-off is presumably a statistic computed for one specific
# run of the data; verify it still applies when the dataset changes.
score_similar = fd_dup1[fd_dup1['Average_Cos_Similarity_Score'] < 0.05700488919422861]

In [None]:
score_similar

In [None]:
score_similar.to_csv("mobile_textually_similar_duplicates.csv")

### Textually Dissimilar data

In [None]:
# Rows whose Average_Cos_Similarity_Score is at or above the hard-coded cut-off.
# NOTE(review): a HIGHER similarity score is labelled "dissimilar" here —
# confirm the comparison is not inverted.
# NOTE(review): the cut-off is presumably a statistic computed for one specific
# run of the data; verify it still applies when the dataset changes.
score_dissimilar = fd_dup1[fd_dup1['Average_Cos_Similarity_Score'] >= 0.05700488919422861]

In [None]:
score_dissimilar

In [None]:
score_dissimilar.to_csv("mobile_textually_dissimilar_duplicates.csv")