### 1596303 & 1263176 Textually Dissimilar Data

In [88]:
desc1 = '''STR:

find a .txt file with mimetype text/plain and content-disposition download + filename.txt
download
ER:
default open app is nodepad

AR:
default open app is "Microsoft WinRT Storage API". Whatever that is.

Marking as a regression though I wonder if it's a win10 change.'''

desc2 = '''
This is basically bug 1260483 but for file extension and mime type associations instead of just protocols:

STR:
1. On Windows 8+, install a Universal App (like Excel Mobile)
2. Ensure it is the default handler for .xls files
3. Attempt to open an xls file from, say, http://www.sample-videos.com/xls/Sample-Spreadsheet-10-rows.xls

The "Opening Sample-Spreadsheet-10-rows.xls" dialog recommends we "Open with TWINUI (default)"

Alternatively,
4. Select "Do this automatically for files like this from now on." On the "Opening Sample-Spreadsheet-10-rows.xls" dialog and press "OK"
5. In Firefox, Options > Applications

It says "Use (default)" instead of "Use Excel Mobile (default)
'''

In [89]:
desc1

'STR:\n\nfind a .txt file with mimetype text/plain and content-disposition download + filename.txt\ndownload\nER:\ndefault open app is nodepad\n\nAR:\ndefault open app is "Microsoft WinRT Storage API". Whatever that is.\n\nMarking as a regression though I wonder if it\'s a win10 change.'

In [90]:
desc2

'\nThis is basically bug 1260483 but for file extension and mime type associations instead of just protocols:\n\nSTR:\n1. On Windows 8+, install a Universal App (like Excel Mobile)\n2. Ensure it is the default handler for .xls files\n3. Attempt to open an xls file from, say, http://www.sample-videos.com/xls/Sample-Spreadsheet-10-rows.xls\n\nThe "Opening Sample-Spreadsheet-10-rows.xls" dialog recommends we "Open with TWINUI (default)"\n\nAlternatively,\n4. Select "Do this automatically for files like this from now on." On the "Opening Sample-Spreadsheet-10-rows.xls" dialog and press "OK"\n5. In Firefox, Options > Applications\n\nIt says "Use (default)" instead of "Use Excel Mobile (default)\n'

In [91]:
import re
# as per recommendation from @freylis, compile once only
CLEANR = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});') 

def cleanhtml(raw_html):
    cleantext = re.sub(CLEANR, '', raw_html)
    return cleantext

In [92]:
des =  str(desc1)
des = des.replace("\n"," ")
desc1 = cleanhtml(des) 

In [93]:
des2 =  str(desc2)
des2 = des2.replace("\n"," ")
desc2 = cleanhtml(des2) 

In [94]:
desc1

'STR:  find a .txt file with mimetype text/plain and content-disposition download + filename.txt download ER: default open app is nodepad  AR: default open app is "Microsoft WinRT Storage API". Whatever that is.  Marking as a regression though I wonder if it\'s a win10 change.'

In [95]:
desc2

' This is basically bug 1260483 but for file extension and mime type associations instead of just protocols:  STR: 1. On Windows 8+, install a Universal App (like Excel Mobile) 2. Ensure it is the default handler for .xls files 3. Attempt to open an xls file from, say, http://www.sample-videos.com/xls/Sample-Spreadsheet-10-rows.xls  The "Opening Sample-Spreadsheet-10-rows.xls" dialog recommends we "Open with TWINUI (default)"  Alternatively, 4. Select "Do this automatically for files like this from now on." On the "Opening Sample-Spreadsheet-10-rows.xls" dialog and press "OK" 5. In Firefox, Options > Applications  It says "Use (default)" instead of "Use Excel Mobile (default) '

In [96]:
#Text Cleaning round 1 (removing punctutions)
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
#     print(text)
    text = re.sub('\w*\d\w*', ' ', text)
    text = re.sub('\w*\f\w*', ' ', text)
    text = re.sub('\(.*?\)', ' ', text)
    text = re.sub('\[.*]\)', ' ', text)
    text = re.sub(r'https?://[A-Za-z0-9./]+',' ',text)
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    return text

In [97]:
desc1 = clean_text_round1(desc1)
desc2 = clean_text_round1(desc2)

In [98]:
print("desc1:\n",desc1)
print("desc2:\n",desc2)

desc1:
 str   find a  txt file with mimetype text plain and content disposition download   filename txt download er  default open app is nodepad  ar  default open app is  microsoft winrt storage api   whatever that is   marking as a regression though i wonder if it s a   change 
desc2:
  this is basically bug   but for file extension and mime type associations instead of just protocols   str     on windows     install a universal app      ensure it is the default handler for  xls files    attempt to open an xls file from  say    videos com xls sample spreadsheet   rows xls  the  opening sample spreadsheet   rows xls  dialog recommends we  open with twinui     alternatively     select  do this automatically for files like this from now on   on the  opening sample spreadsheet   rows xls  dialog and press  ok     in firefox  options   applications  it says  use    instead of  use excel mobile   


In [99]:
# Apply a second round of cleaning (removing punctuations)
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', ' ', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\t', ' ', text)
    return text

In [100]:
desc1 = clean_text_round2(desc1)
desc2 = clean_text_round2(desc2)

In [101]:
print("desc1:\n",desc1)
print("desc2:\n",desc2)

desc1:
 str   find a  txt file with mimetype text plain and content disposition download   filename txt download er  default open app is nodepad  ar  default open app is  microsoft winrt storage api   whatever that is   marking as a regression though i wonder if it s a   change 
desc2:
  this is basically bug   but for file extension and mime type associations instead of just protocols   str     on windows     install a universal app      ensure it is the default handler for  xls files    attempt to open an xls file from  say    videos com xls sample spreadsheet   rows xls  the  opening sample spreadsheet   rows xls  dialog recommends we  open with twinui     alternatively     select  do this automatically for files like this from now on   on the  opening sample spreadsheet   rows xls  dialog and press  ok     in firefox  options   applications  it says  use    instead of  use excel mobile   


In [102]:
import nltk

In [103]:
sent_text1 = nltk.sent_tokenize(desc1)

In [104]:
for sentence in sent_text1:
    tokenized_text1 = nltk.word_tokenize(sentence)
    print(tokenized_text1)

['str', 'find', 'a', 'txt', 'file', 'with', 'mimetype', 'text', 'plain', 'and', 'content', 'disposition', 'download', 'filename', 'txt', 'download', 'er', 'default', 'open', 'app', 'is', 'nodepad', 'ar', 'default', 'open', 'app', 'is', 'microsoft', 'winrt', 'storage', 'api', 'whatever', 'that', 'is', 'marking', 'as', 'a', 'regression', 'though', 'i', 'wonder', 'if', 'it', 's', 'a', 'change']


In [105]:
sent_text2 = nltk.sent_tokenize(desc2)

In [106]:
for sentence in sent_text2:
    tokenized_text2 = nltk.word_tokenize(sentence)
    print(tokenized_text2)

['this', 'is', 'basically', 'bug', 'but', 'for', 'file', 'extension', 'and', 'mime', 'type', 'associations', 'instead', 'of', 'just', 'protocols', 'str', 'on', 'windows', 'install', 'a', 'universal', 'app', 'ensure', 'it', 'is', 'the', 'default', 'handler', 'for', 'xls', 'files', 'attempt', 'to', 'open', 'an', 'xls', 'file', 'from', 'say', 'videos', 'com', 'xls', 'sample', 'spreadsheet', 'rows', 'xls', 'the', 'opening', 'sample', 'spreadsheet', 'rows', 'xls', 'dialog', 'recommends', 'we', 'open', 'with', 'twinui', 'alternatively', 'select', 'do', 'this', 'automatically', 'for', 'files', 'like', 'this', 'from', 'now', 'on', 'on', 'the', 'opening', 'sample', 'spreadsheet', 'rows', 'xls', 'dialog', 'and', 'press', 'ok', 'in', 'firefox', 'options', 'applications', 'it', 'says', 'use', 'instead', 'of', 'use', 'excel', 'mobile']


In [107]:
common_elements = list(set(tokenized_text1).intersection(set(tokenized_text2)))

In [108]:
len(common_elements)

10