## Getting Privacy Text

In [1]:
import string
import nltk
import re
import spacy
nlp = spacy.load("en_core_web_sm")

### Text Preprocessing

In [2]:
def remove_punctuation(text):
    no_punctuation_text = ''.join([i for i in str(text) if i not in string.punctuation])
    return no_punctuation_text.lower()

In [3]:
def remove_nonwords(str_):
    return re.sub("[^A-Za-z ]\w+[^A-Za-z]*", ' ', str_)

In [4]:
# Lemmatization and Removing stop words and non words
def text_preprocessing(text):
    text = remove_punctuation(text)
    text = remove_nonwords(text)
    tokenized_text = [token.lemma_ for token in nlp(text)]
    no_stopwords_list = [i.lower() for i in tokenized_text if i not in nlp.Defaults.stop_words]
    lemma_text = ' '.join(no_stopwords_list)
    return lemma_text

### Web scraping

In [5]:
from bs4 import BeautifulSoup
import requests

In [6]:
# url of website to scrape
web_url = "https://www.facebook.com/" 

In [7]:
source = requests.get(web_url).text
soup = BeautifulSoup(source, 'lxml')

In [8]:
a_tag = soup.find_all("a") #Gives you the list of all the a tags

In [9]:
urls = []
for i in a_tag:
    for term in ["privacy", "terms","conditions","policy", "legal"]:
        if term in i.text.lower():
            urls.append(i["href"])

raw_policies = []

for url in urls: 
    if url[0] == "/":
        url = web_url + url
    try:
        source = requests.get(url).text
        soup = BeautifulSoup(source, 'lxml')
    except:
        continue
    policies=soup.find('html')
    
    raw_policies.append(policies.text)
                    
    

### Adding spaces between joined words

In [10]:
import wordninja
from nltk.tokenize import word_tokenize

clean_words = []
list_of_words = word_tokenize(". ".join(raw_policies))
for word in list_of_words:
    if len(word) > 10:
        words = wordninja.split(word)
        clean_words.extend(words)
    clean_words.append(word)

In [11]:
clean_text = " ".join(clean_words)

In [12]:
remove_unwanted_punctuations_words = []
for i,word in enumerate(word_tokenize(clean_text)):
    if "." in word:
        if len(word.strip()) > 1:
            word = remove_punctuation(word)
        if word == ".":
            if word_tokenize(clean_text)[i+1].strip()[0] == word_tokenize(clean_text)[i+1].lower()[0]:
                word = " "
    remove_unwanted_punctuations_words.append(word.strip())

In [13]:
clean_text = " ".join(remove_unwanted_punctuations_words)

In [14]:
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

punkt_param = PunktParameters()
tokenizer = PunktSentenceTokenizer(punkt_param)
clean_policy = tokenizer.tokenize(clean_text)

In [15]:
clean_policy

['Facebook Jump toSections of this page Accessibility pageAccessibility HelpPress alt + / to open this menu Facebook Email menuFacebookEmail or Phone Password Forgot PhonePasswordForgot account ?',
 'Sign UpFull Data Policy Facebook Ads Controls Privacy Basics Cookies Policy Terms More Resources View a printable version of the Data Policy Interactive Interactive Tools Minors and Safety Facebook Privacy Page Facebook Safety Page Facebook Site Governance Page euus Privacy Shield and swissus Privacy Shield Notice Data Policy This policy describes the information information we process to support Facebook , Instagram , Messenger and other products and features offered by Facebook ( Facebook Products or Products ) .',
 'You can find additional tools and information information in the Facebook Settings and Instagram Settings .',
 'Return to top What kinds of information information do we collect ?',
 'To provide the Facebook Products , we must process information information about you .',
 '

### Find all the privacy text similar to good and bad privacy

#### Reading the data

In [16]:
good_privacy = []
with open("./Data/good_privacy.txt", "r", encoding="utf8") as f:
    for line in f:
        text = text_preprocessing(line.rstrip().lower())
        good_privacy.append(text)

In [17]:
bad_privacy = []
with open("./Data/bad_privacy.txt", "r",  encoding="utf8") as f:
    for line in f:
        text = text_preprocessing(line.rstrip().lower())
        bad_privacy.append(text)

In [18]:
all_privacy = good_privacy + bad_privacy

#### Finding similarity score

In [19]:
# Program to measure the similarity between 
# two sentences using cosine similarity.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [20]:
sim_sentences = []
for X_raw in clean_policy:
    X = text_preprocessing(X_raw)
    high_sim = 0
    sim_sentence = ""
    for Y in all_privacy:
        
        l1 =[];l2 =[]

        X_set = {w for w in word_tokenize(X)} 
        Y_set = {w for w in word_tokenize(Y)}

        # form a set containing keywords of both strings 
        rvector = X_set.union(Y_set) 
        for w in rvector:
            if w in X_set: l1.append(1) # create a vector
            else: l1.append(0)
            if w in Y_set: l2.append(1)
            else: l2.append(0)
        c = 0

        # cosine formula 
        for i in range(len(rvector)):
                c+= l1[i]*l2[i]
        cosine = c / float((sum(l1)*sum(l2))**0.5)
        
        if cosine > high_sim:
            high_sim = cosine
            sim_sentence = X_raw
            
    if high_sim > 0.35:
        sim_sentences.append(sim_sentence.strip())
        


In [21]:
final_sentences = []
for sim_sentence in sim_sentences:
    if "?" not in sim_sentence:
        final_sentences.append(sim_sentence)

In [22]:
final_sentences

['To provide the Facebook Products , we must process information information about you .',
 'The types of information information we collect depend on how you use our Products .',
 'Things you and others do and provide Information provideinformation and content you provide .',
 'We collect the content , communications communications and other information information you provide when you use our Products , including when you sign up for an account , create or share content , and message or communicate communicate with others .',
 'This can include information information in or about the content you provide ( like metadata ) , such as the location of a photo or the date a file was created .',
 'Data with special protections protections : You can choose to provide information information in your Facebook profile fields or Life Events about your religious views , political views , who you are `` interested in , `` or your health .',
 'We collect information information about the people , P