In [1]:
SEED = 1

import pandas as pd
import sklearn
import seaborn as sns

PATH_SENTENCE_TEXT = r"../dataset/concat_sentence_text.csv"
PATH_SEGMENT_TEXT = r"../dataset/concat_segment_text.csv"

In [2]:
df_segments = pd.read_csv(PATH_SEGMENT_TEXT)
df_segments.head()

Unnamed: 0,segment_text,practice,modality
0,PRIVACY POLICY This privacy policy (hereafter ...,,
1,1. ABOUT OUR PRODUCTS 1.1 Our products offer a...,,
2,2. THE INFORMATION WE COLLECT The information ...,Identifier_Cookie_or_similar_Tech_1stParty,PERFORMED
3,2. THE INFORMATION WE COLLECT The information ...,Identifier_IP_Address_1stParty,PERFORMED
4,"2.2 In addition, we store certain information ...",Identifier_Cookie_or_similar_Tech_1stParty,PERFORMED


### There are NaNs in the data
Because some segments do not have an associated privacy practice. 
Tried fill na with "none", but performance was not that different.

In [3]:
df_segments = df_segments.dropna()
df_segments.head()

Unnamed: 0,segment_text,practice,modality
2,2. THE INFORMATION WE COLLECT The information ...,Identifier_Cookie_or_similar_Tech_1stParty,PERFORMED
3,2. THE INFORMATION WE COLLECT The information ...,Identifier_IP_Address_1stParty,PERFORMED
4,"2.2 In addition, we store certain information ...",Identifier_Cookie_or_similar_Tech_1stParty,PERFORMED
8,2.3 6677g may also use ad network providers to...,Identifier_Cookie_or_similar_Tech_3rdParty,PERFORMED
10,2.5 6677g may share demographic information (c...,Demographic_3rdParty,PERFORMED


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words={'english'}, ngram_range=(1,2), strip_accents='ascii', binary = True)
tfidf_vectors = vectorizer.fit_transform(df_segments["segment_text"])

In [6]:
x_train, x_test, y_train, y_test = train_test_split(tfidf_vectors, df_segments["practice"], test_size = 0.2, random_state = SEED)

## Testing with logistic regression

In [7]:
from sklearn.linear_model import LogisticRegression

In [8]:
logistic_clf = LogisticRegression(random_state = SEED, max_iter = 500, n_jobs = -1, multi_class = "ovr").fit(x_train, y_train)
y_pred = logistic_clf.predict(x_test)

In [9]:
print(classification_report(y_test, y_pred))

                                            precision    recall  f1-score   support

                          Contact_1stParty       0.11      0.03      0.04        37
                          Contact_3rdParty       0.00      0.00      0.00         5
             Contact_Address_Book_1stParty       0.33      0.23      0.27        40
             Contact_Address_Book_3rdParty       0.00      0.00      0.00         3
                     Contact_City_1stParty       0.00      0.00      0.00        20
                     Contact_City_3rdParty       0.00      0.00      0.00         3
           Contact_E_Mail_Address_1stParty       0.25      0.56      0.34       248
           Contact_E_Mail_Address_3rdParty       0.00      0.00      0.00        29
                 Contact_Password_1stParty       0.08      0.02      0.03        46
                 Contact_Password_3rdParty       0.00      0.00      0.00         2
             Contact_Phone_Number_1stParty       0.14      0.16      0.15  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Testing with SGDClassifier

In [10]:
from sklearn.linear_model import SGDClassifier

In [11]:
clf_sgdclassifier = SGDClassifier(loss = "hinge", max_iter = 5000, random_state=SEED, n_jobs = -1).fit(x_train, y_train)
y_pred = clf_sgdclassifier.predict(x_test)

In [12]:
print(classification_report(y_test, y_pred))

                                            precision    recall  f1-score   support

                          Contact_1stParty       0.13      0.14      0.13        37
                          Contact_3rdParty       0.00      0.00      0.00         5
             Contact_Address_Book_1stParty       0.30      0.53      0.39        40
             Contact_Address_Book_3rdParty       0.00      0.00      0.00         3
                     Contact_City_1stParty       0.00      0.00      0.00        20
                     Contact_City_3rdParty       0.00      0.00      0.00         3
           Contact_E_Mail_Address_1stParty       0.31      0.33      0.32       248
           Contact_E_Mail_Address_3rdParty       0.07      0.03      0.05        29
                 Contact_Password_1stParty       0.20      0.13      0.16        46
                 Contact_Password_3rdParty       0.00      0.00      0.00         2
             Contact_Phone_Number_1stParty       0.10      0.12      0.11  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
