In [66]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [2]:
def lemmatization(lemmatizer, sentence):
    lem = [lemmatizer.lemmatize(k) for k in sentence]
    lem = set(lem)
    return [k for k in lem]

In [3]:
def remove_stop_words(stopwords_list, sentence):
    return [k for k in sentence if k not in stopwords_list]

In [6]:
def preprocess(point):
    lemmatizer = WordNetLemmatizer()
    stopwords_list = stopwords.words('english')
    point = point.lower()
    words = word_tokenize(point) # Remove punctuations
    new_words = [word for word in words if word.isalnum()] # Getting only alpha numerics
    remove_num = [re.sub('[0-9]', '', i) for i in new_words] # Remove Numbers if any after getting alpha numeric
    remove_num = [i for i in remove_num if len(i) > 0] # Remove empty strings
    lemmatized = lemmatization(lemmatizer, remove_num) # Word Lemmatization
    remove_stop = remove_stop_words(stopwords_list, lemmatized) # remove stop words
    updated_term = ' '.join(remove_stop)
    return updated_term

In [44]:
def preprocess_list(points):
    point_arr = []
    for index, point in points.items():
        new_point = preprocess(point)
        point_arr.append(new_point)
    return point_arr

In [63]:
# Jaccard similarity
def jaccard_sim(point1, point2):
    tkns1 = set(word_tokenize(point1))
    tkns2 = set(word_tokenize(point2))
    intersection = len(tkns1.intersection(tkns2))
    union = len(tkns1) + len(tkns2) - intersection
    sim = intersection / union
    return sim

In [8]:
my_df = pd.read_csv("../project-resources/my_dataset.csv")
my_df.head()

Unnamed: 0,id,website,topic,subtopic,agree/disagree,terms
0,1,khan academy,privacy policy,user information collection,1.0,"When you create an account on the Service, or ..."
1,2,khan academy,privacy policy,user information collection,1.0,"After you register, you may also choose to pro..."
2,3,khan academy,privacy policy,user information collection,1.0,"In addition, we may ask you for personal infor..."
3,4,khan academy,privacy policy,user information collection,1.0,If you decide to register through or otherwise...
4,5,khan academy,privacy policy,user information collection,1.0,You may also have the option of sharing additi...


In [35]:
other_df = pd.read_csv("../project-resources/testing_data.csv")
other_df.head()

Unnamed: 0,id,website,topic,subtopic,term,positive/negative
0,1,chegg.com,privacy policy,user information collection,"When you create a user account, we collect you...",
1,2,chegg.com,privacy policy,user information collection,You may have the opportunity to create a profi...,
2,3,chegg.com,privacy policy,user information collection,"When you register to be a tutor, we will colle...",
3,4,chegg.com,privacy policy,user information collection,We collect personal information from you such ...,
4,5,chegg.com,privacy policy,user information collection,The Services may allow community members to co...,


In [22]:
# Original words
my_df['terms'][0]

'When you create an account on the Service, or communicate with us, we may collect personal information including your name, email address, and birthdate.'

In [24]:
# Preprocessed words
preprocess(my_df['terms'][0])

'account email address name birthdate u create service may communicate information collect personal including'

In [80]:
# Preprocessing my all terms and add them in a new column
my_df['new_terms'] = preprocess_list(my_df['terms'])
my_df['new_terms']

0      account email address name birthdate u create ...
1      register demographic provide profile location ...
2      email otherwise user send complete ask u suppo...
3      register single otherwise account decide alrea...
4      sharing option service may additional integrat...
                             ...                        
144    incorrect delete reside hold copy provide rest...
145    remain residual email period every provide ans...
146    obligation email need comply provide request i...
147    delete longer data provide necessary ask u ser...
148    account data ask u certain case inaccurate par...
Name: new_terms, Length: 149, dtype: object

In [84]:
# Preprocessing other all terms and add them in a new column
other_df['new_terms'] = preprocess_list(other_df['term'])
other_df['new_terms'].head()

0    account email address user school code prefere...
1    photograph history work academic opportunity i...
2    register tutor obligation account country need...
3    register email address otherwise customer job ...
4    communication allow text community collaborate...
Name: new_terms, dtype: object

In [68]:
# Cosine similarity matrix
my_points = my_df['new_terms']
other_points = other_df['new_terms']

# Train tfidf on my_points
tfidf = TfidfVectorizer()
tfidf_my_points = tfidf.fit_transform(my_points)

# Tranform the corpus using the trained tfidf
tfidf_other_points = tfidf.transform(other_points)

# Cosine similarity (similarity matrix)
cos_similarity = np.dot(tfidf_other_points, tfidf_my_points.T).A
cos_similarity

array([[0.45983951, 0.10845019, 0.13311983, ..., 0.09841881, 0.10630498,
        0.10397155],
       [0.18216179, 0.09680598, 0.0241785 , ..., 0.07283323, 0.        ,
        0.        ],
       [0.0847251 , 0.06798167, 0.00628415, ..., 0.26312968, 0.063923  ,
        0.03655197],
       ...,
       [0.03366364, 0.18603621, 0.02511572, ..., 0.05072395, 0.        ,
        0.03892878],
       [0.18020835, 0.02655309, 0.05920861, ..., 0.04838494, 0.        ,
        0.        ],
       [0.04177613, 0.01522503, 0.01241826, ..., 0.13832135, 0.        ,
        0.        ]])

In [67]:
jaccard_sim(my_df['new_terms'][0], other_df['new_terms'][0])

0.32142857142857145

In [86]:
for my in my_df['new_terms']:
    for other in other_df['new_terms']:
        sim = jaccard_sim(my, other)
        if sim > 0.3:
            print(sim)

0.32142857142857145
0.3333333333333333
0.3333333333333333
