In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
# Define a function to calculate Jaccard similarity
def sen_jaccard_similarity(s1, s2):
    # Tokenize the strings and remove stop words
    s1 = BeautifulSoup(s1, "lxml").text
    s1 = ''.join(char for char in s1 if ord(char) < 128)
    s2 = BeautifulSoup(s2, "lxml").text
    s2 = ''.join(char for char in s2 if ord(char) < 128)
    
    s1_tokens = set([word.lower() for word in word_tokenize(s1)])
    s2_tokens = set([word.lower() for word in word_tokenize(s2)])
    # Calculate the Jaccard similarity coefficient
    #print(s1_tokens)
    #print(s2_tokens)
    
    return float(len(s1_tokens.intersection(s2_tokens))) / float(len(s1_tokens.union(s2_tokens)))


def tag_jaccard_similarity(s1, s2):
    # Tokenize the strings and remove stop words
    s1 = s1.replace("<","")
    s1 = s1.replace(">"," ").strip()
    
    s2 = s2.replace("<","")
    s2 = s2.replace(">"," ").strip()
    
    s1_tokens = set([word.lower() for word in word_tokenize(s1)])
    s2_tokens = set([word.lower() for word in word_tokenize(s2)])
    # Calculate the Jaccard similarity coefficient
    #print(s1_tokens)
    #print(s2_tokens)
    
    return float(len(s1_tokens.intersection(s2_tokens))) / float(len(s1_tokens.union(s2_tokens)))

def cosine_similarity(s1, s2):
    # Tokenize the strings and remove stop words
    s1 = s1.replace("<","")
    s1 = s1.replace(">"," ").strip()
    
    s2 = s2.replace("<","")
    s2 = s2.replace(">"," ").strip()
    
    s1_tokens = set([word.lower() for word in word_tokenize(s1)])
    s2_tokens = set([word.lower() for word in word_tokenize(s2)])
    
    # form a set containing keywords of both strings
    l1 =[]
    l2 =[]
    rvector = s1_tokens.union(s2_tokens) 
    for w in rvector:
        if w in s1_tokens: 
            l1.append(1) # create a vector
        else: 
            l1.append(0)
        if w in s2_tokens: 
            l2.append(1)
        else: 
            l2.append(0)
            
    c = 0
    # cosine formula 
    for i in range(len(rvector)):
        c+= l1[i]*l2[i]
    cosine = c / float((sum(l1)*sum(l2))**0.5)
    print("similarity: ", cosine)
    
    return cosine

In [4]:
s1 = "With tools such as cheat engine, you can read values from the RAM of other programs and when I save the key/iv in a variable you can read them too. Is there a way to prevent that? And is there an way to save the decrypted Text instantly in a SecureString? So it is never saved as a plain string?"
s2 = "Safe way to encrypt and decrypt a text? <p>I use the CryptoStream class to encrypt a text. If I want to decrypt it, I have to know the key and the iv so I can get the original text back, but where do I save them, so that they cannot be stolen?</p>"
result = cosine_similarity(s1,s2)

similarity:  0.3730019232961255


In [118]:
df_result = pd.read_csv("../StackExchange/Stackoverflow/cluster/result/test1_result1.csv")

In [119]:
df_result.head()

Unnamed: 0,Id,sentence,tag,label,data_group,y_pred,compare
0,14523598,AutoDeploy a WAR file found in a subfolder of ...,<tomcat><war><autodeploy>,50591,2,1258,0
1,14533903,MVC ORM and user related data <p>Yii provides ...,<php><session><orm><yii><yii-cmodel>,329,2,281019,0
2,1515086,#defined bitflags and enums - peaceful coexist...,<c><enums><macros><bitflags>,155,2,4837,0
3,18756728,Mysql match against unknown column <p>I have a...,<mysql><sql><match><against>,266811,2,98579,0
4,22487446,Object.getOwnPropertyNames() vs Object.prototy...,<javascript><hasownproperty>,16827,2,16827,1


In [120]:
wrong_list = list(df_result[df_result["compare"] == 0].label.values)

In [121]:
len(wrong_list)

52

In [122]:
right_list = list(df_result[df_result["compare"] == 1].label.values)

In [123]:
len(right_list)

74

In [110]:
df = pd.read_csv("../StackExchange/Stackoverflow/cluster_clean.csv")

In [111]:
tags_wrong = [list(df[df["label"] == label_num]["tag"].values) for label_num in wrong_list]
tags_right = [list(df[df["label"] == label_num]["tag"].values) for label_num in right_list]

In [112]:
sim_result = []
for group in tags_wrong:
    total_sim = 0
    count = 0
    for i in range(len(group)):
        for j in range(i+1, len(group)):
            similarity = tag_jaccard_similarity(group[i], group[j])
            total_sim += similarity
            count += 1
    result = total_sim/count
    sim_result.append(result)
    print("Similarity: {}".format(result))

Similarity: 0.08004046446164666
Similarity: 0.043639472806139466
Similarity: 0.0613300492610837
Similarity: 0.13322622796307054
Similarity: 0.09757245069745078
Similarity: 0.09848618459729565
Similarity: 0.05354512385762382
Similarity: 0.17335992907801362
Similarity: 0.05576471968097583
Similarity: 0.14218214417550024
Similarity: 0.08955060198682664
Similarity: 0.15582778631165747
Similarity: 0.11685540069686444
Similarity: 0.10223190089043785
Similarity: 0.14581327871650485
Similarity: 0.12485039042498736
Similarity: 0.10393838538999857
Similarity: 0.01664418641019626
Similarity: 0.21185437137818136
Similarity: 0.24614478114478064
Similarity: 0.1364712408260797
Similarity: 0.06422333453583451
Similarity: 0.09872333389620663
Similarity: 0.20085300786981514
Similarity: 0.1186468385803937
Similarity: 0.19467155067155073
Similarity: 0.3566404722808666
Similarity: 0.18730718355718406
Similarity: 0.058087342079689186
Similarity: 0.03430814354727398
Similarity: 0.10407224958949116
Similarity

In [113]:
sum(sim_result) / len(sim_result)

0.1349811153957326

In [114]:
sim_result_good = []
for group in tags_right:
    total_sim = 0
    count = 0
    for i in range(len(group)):
        for j in range(i+1, len(group)):
            similarity = tag_jaccard_similarity(group[i], group[j])
            total_sim += similarity
            count += 1
    result = total_sim/count
    sim_result_good.append(result)
    print("Similarity: {}".format(result))

Similarity: 0.1497595951961033
Similarity: 0.07676681783824636
Similarity: 0.3954323370990028
Similarity: 0.17083673469387647
Similarity: 0.1852370777370779
Similarity: 0.22394599303135845
Similarity: 0.1913912231559296
Similarity: 0.18994617770479882
Similarity: 0.18572281959378795
Similarity: 0.23285250256016427
Similarity: 0.21924001924001998
Similarity: 0.16255077658303496
Similarity: 0.24341728754994085
Similarity: 0.18743505410172095
Similarity: 0.16250510026372122
Similarity: 0.07983956306536953
Similarity: 0.2108052575443881
Similarity: 0.22074452764977026
Similarity: 0.14608803332207498
Similarity: 0.3172772055530671
Similarity: 0.1503794037940383
Similarity: 0.16792850473522775
Similarity: 0.09663028001898474
Similarity: 0.19515248975934507
Similarity: 0.1835927704415105
Similarity: 0.0234465362372339
Similarity: 0.2380075339314472
Similarity: 0.13412424740010967
Similarity: 0.16090597855303773
Similarity: 0.07261871018248588
Similarity: 0.08192449819433939
Similarity: 0.3058

In [115]:
sum(sim_result_good) / len(sim_result_good)

0.1700622441713211

In [25]:
# Define a list of strings
string_list = ["I love apples", "I love oranges", "I like bananas", "I hate broccoli"]
string_list1 = ["<javascript><string><fromcharcode>", "<android><c++><opencv><java-native-interface><opencv-stitching>", 
               "<java><android><sensors><angle><tilt>", "<java><swing><swingx><fest><jxtreetable>"]

# Loop through each pair of strings and print their Jaccard similarity coefficient
for i in range(len(string_list1)):
    for j in range(i+1, len(string_list1)):
        similarity = tag_jaccard_similarity(string_list1[i], string_list1[j])
        print("Similarity between '{}' and '{}' is: {}".format(string_list1[i], string_list1[j], similarity))

{'fromcharcode', 'javascript', 'string'}
{'opencv-stitching', 'java-native-interface', 'c++', 'opencv', 'android'}
Similarity between '<javascript><string><fromcharcode>' and '<android><c++><opencv><java-native-interface><opencv-stitching>' is: 0.0
{'fromcharcode', 'javascript', 'string'}
{'angle', 'tilt', 'java', 'sensors', 'android'}
Similarity between '<javascript><string><fromcharcode>' and '<java><android><sensors><angle><tilt>' is: 0.0
{'fromcharcode', 'javascript', 'string'}
{'swing', 'fest', 'swingx', 'java', 'jxtreetable'}
Similarity between '<javascript><string><fromcharcode>' and '<java><swing><swingx><fest><jxtreetable>' is: 0.0
{'opencv-stitching', 'java-native-interface', 'c++', 'opencv', 'android'}
{'angle', 'tilt', 'java', 'sensors', 'android'}
Similarity between '<android><c++><opencv><java-native-interface><opencv-stitching>' and '<java><android><sensors><angle><tilt>' is: 0.1111111111111111
{'opencv-stitching', 'java-native-interface', 'c++', 'opencv', 'android'}
{'s

In [124]:
tags_wrong_index = []
for ilist in [list(df[df["label"] == label_num].index.values) for label_num in wrong_list]:
    tags_wrong_index.extend(ilist)

In [126]:
len(tags_wrong_index)

1846

In [128]:
set_category = np.zeros(len(df)).tolist()


In [131]:
for index in tags_wrong_index:
    set_category[index] = 1

In [135]:
df["y_wrong"] = set_category
df.to_csv("../StackExchange/Stackoverflow/cluster/result/dataset_result1.csv",index = None)