In [1]:
import pandas as pd
import numpy as np
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import requests
from readability import Document
import re
from unicodedata import normalize

In [11]:
def key_terms(url):
    response = requests.get(url)
    doc = Document(response.text)
    full_text = doc.summary(html_partial=True)
    full_text = full_text.replace(r"\n", " ")
    full_text = full_text.replace(r"\t", " ")
    full_text = full_text.replace(r"/", " ")
    full_text = full_text.replace(r"<p>", " ")
    full_text = normalize('NFKD', full_text)
    full_text = full_text.split('< p>')
    
    TAG_RE = re.compile(r'<[^>][^>]+>')
    
    def remove_tags(text):
        return TAG_RE.sub(' ', text)
    
    term_text = list(map(remove_tags, full_text))
    term_frame = pd.DataFrame(np.array(term_text), columns = ['quoteText'])
    
    tokenizer = RegexpTokenizer(r'\w+')
    term_frame['tokenized'] = term_frame['quoteText'].apply(tokenizer.tokenize)
    
    def text_to_words(titletext):
        letters_only = re.sub("[^a-zA-Z]", " ", titletext)
        words = letters_only.lower().split()
        lemmatizer = WordNetLemmatizer()
        tokens_lem = [lemmatizer.lemmatize(i) for i in words]
        return(' '.join(tokens_lem))
    
    
    lemm_text=[]
    for text in term_frame['quoteText']:
        lemm_text.append(text_to_words(text))
        
        return lemm_text

In [15]:
key_terms('https://www.whatsapp.com/legal?doc=terms-of-service&version=20120707')

['your acceptance this is an agreement between whatsapp inc a california corporation whatsapp the owner and operator of www whatsapp com the whatsapp site the whatsapp software including whatsapp messenger collectively including all content provided by whatsapp through whatsapp messenger and the whatsapp site the whatsapp service or the service and you you or you a user of the service by using the service you acknowledge and agree to these term of service and whatsapp s privacy policy which can be found at http www whatsapp com legal privacy and which are incorporated herein by reference if you choose to not agree with any of these term you may not use the service']

In [92]:
df = pd.read_csv('../webapp/revised_rating_data')

In [93]:
df

Unnamed: 0,id,needModeration,services,title,quoteText,topics,case,tokenized,lemmatized,point_bad,point_non-bad
0,5309,0.0,10minutemail,IP addresses of website visitors are not tracked,10MinuteMail com does NOT keep logs or records...,Anonymity and Tracking,IP addresses of website visitors are not tracked,"['10MinuteMail', 'com', 'does', 'NOT', 'keep',...",minutemail com doe not keep log or record of y...,0,1
1,5310,0.0,10minutemail,This service only uses temporary session cookies,A temporary cookie is used to allow the servic...,Cookies,This service only uses temporary session cookies,"['A', 'temporary', 'cookie', 'is', 'used', 'to...",a temporary cookie is used to allow the servic...,0,1
2,6723,0.0,1password,"Users should revisit the terms periodically, a...","At our discretion, we may make changes to this...",Changes,"Users should revisit the terms periodically, a...","['At', 'our', 'discretion', 'we', 'may', 'make...",at our discretion we may make change to this p...,0,1
3,6725,1.0,1password,The service is provided 'as is' and to be used...,Your use of the Service is at your sole risk ...,User information,The service is provided 'as is' and to be used...,"['Your', 'use', 'of', 'the', 'Service', 'is', ...",your use of the service is at your sole risk t...,1,0
4,6726,1.0,1password,The service provider makes no warranty regardi...,"AgileBits, Inc its subsidiaries, affiliates, ...",Guarantee,The service provider makes no warranty regardi...,"['AgileBits', 'Inc', 'its', 'subsidiaries', 'a...",agilebits inc it subsidiary affiliate and it l...,1,0
...,...,...,...,...,...,...,...,...,...,...,...
2398,7170,1.0,zenimaxmediainc-,This service ignores the Do Not Track (DNT) he...,"Do Not Track Requests Currently, our systems...",Anonymity and Tracking,This service ignores the Do Not Track (DNT) he...,"['Do', 'Not', 'Track', 'Requests', 'Currently'...",do not track request currently our system do n...,1,0
2399,7171,1.0,zenimaxmediainc-,The service allows you to opt out of providing...,Custom Audiences and Matching Unless you ha...,User choice,The service allows you to opt out of providing...,"['Custom', 'Audiences', 'and', 'Matching', 'Un...",custom audience and matching unless you have o...,0,1
2400,7760,1.0,zoosk,This service is only available to users of a c...,AGE RESTRICTIONS Although our website is a gen...,Governance,This service is only available to users of a c...,"['AGE', 'RESTRICTIONS', 'Although', 'our', 'we...",age restriction although our website is a gene...,0,1
2401,7761,1.0,zoosk,This service ignores the Do Not Track (DNT) he...,We do not respond to do not track signals set ...,Anonymity and Tracking,This service ignores the Do Not Track (DNT) he...,"['We', 'do', 'not', 'respond', 'to', 'do', 'no...",we do not respond to do not track signal set b...,1,0


In [104]:
lemmatized = df['lemmatized'].tolist()

In [151]:
X = df['lemmatized']
y = df['point_non-bad']

In [152]:
X.shape

(2403,)

In [153]:
y.shape

(2403,)

In [154]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [155]:
len(X_train)

1802

In [156]:
y_train.shape

(1802,)

In [157]:
y.value_counts(normalize=True)

0    0.516438
1    0.483562
Name: point_non-bad, dtype: float64

In [163]:
tvec = TfidfVectorizer(stop_words='english')
tvec.fit(X_train.values.astype('U'))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [165]:
X_train = tvec.transform(X_train.values.astype('U'))

In [166]:
lr = LogisticRegression()

In [168]:
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [172]:
url = "https://www.whatsapp.com/legal?doc=terms-of-service&version=20120707"
response = requests.get(url)
doc = Document(response.text)
full_text = doc.summary(html_partial=True)

In [175]:
full_text

'<div><div class="_-pk rich-text"><h3>1. Your Acceptance</h3>\n<p>This is an agreement between WhatsApp Inc., a California corporation ("WhatsApp"), the owner and operator of <a href="http://www.whatsapp.com/" data-lnfb-mode="ie">www.whatsapp.com</a> (the "WhatsApp Site"), the WhatsApp software, including WhatsApp Messenger (collectively, including all content provided by WhatsApp through WhatsApp Messenger and the WhatsApp Site, the "WhatsApp Service", or the "Service"), and you ("you" or "You"), a user of the Service. BY USING THE SERVICE, YOU ACKNOWLEDGE AND AGREE TO THESE TERMS OF SERVICE, AND WHATSAPP\'S PRIVACY POLICY, WHICH CAN BE FOUND AT <a class="_36or" href="http://www.whatsapp.com/legal/#Privacy" data-lnfb-mode="ie"/><a href="http://www.whatsapp.com/legal/#Privacy" data-lnfb-mode="ie">http://www.whatsapp.com/legal/#Privacy</a>, AND WHICH ARE INCORPORATED HEREIN BY REFERENCE. If you choose to not agree with any of these terms, you may not use the Service.</p>\n<h3>2. WhatsAp

In [205]:
full_text = full_text.replace(r"\n", " ")
full_text = full_text.replace(r"\t", " ")
full_text = full_text.replace(r"/", " ")
full_text = full_text.replace(r"<p>", " ")
full_text = normalize('NFKD', full_text)
full_text = full_text.split('< p>')
TAG_RE = re.compile(r'<[^>][^>]+>')
    
def remove_tags(text):
    return TAG_RE.sub(' ', text)
    
term_text = list(map(remove_tags, full_text))
term_frame = pd.DataFrame(np.array(term_text), columns = ['quoteText'])
    
def text_to_words(titletext):
    letters_only = re.sub("[^a-zA-Z]", " ", titletext)
    words = letters_only.lower().split()
    lemmatizer = WordNetLemmatizer()
    tokens_lem = [lemmatizer.lemmatize(i) for i in words]
    return(' '.join(tokens_lem))
    
lemm_text=[]
for text in data['quoteText']:
    lemm_text.append(text_to_words(text))

        

In [208]:
lemm_text = text_cleaning(full_text)
lemm_text

['your acceptance this is an agreement between whatsapp inc a california corporation whatsapp the owner and operator of www whatsapp com the whatsapp site the whatsapp software including whatsapp messenger collectively including all content provided by whatsapp through whatsapp messenger and the whatsapp site the whatsapp service or the service and you you or you a user of the service by using the service you acknowledge and agree to these term of service and whatsapp s privacy policy which can be found at http www whatsapp com legal privacy and which are incorporated herein by reference if you choose to not agree with any of these term you may not use the service',
 'whatsapp service these term of service apply to all user of the whatsapp service information provided by our user through the whatsapp service may contain link to third party website that are not owned or controlled by whatsapp whatsapp ha no control over and assumes no responsibility for the content privacy policy or pra

In [219]:
term_frame = pd.DataFrame(np.array(lemm_text), columns = ['lemm_text'])
term_frame

Unnamed: 0,lemm_text
0,your acceptance this is an agreement between w...
1,whatsapp service these term of service apply t...
2,whatsapp access a subject to your compliance w...
3,b in order to access and use the feature of th...
4,c you agree not to use or launch any automated...
5,intellectual property right the design of the ...
6,user status submission a the whatsapp service ...
7,b you shall be solely responsible for your own...
8,c in connection with status submission you fur...
9,d adult content must be identified a such what...


In [271]:
vect = tvec.transform(lemm_text).toarray()
prediction = pd.DataFrame(lr.predict_proba(vect), columns=['warning','non-warning'])
results = pd.merge(term_frame, prediction, left_index=True, right_index=True)
my_prediction = results["warning"].mean()

In [273]:
my_prediction

0.43443139778469436