In [1]:
SEED = 1

import pandas as pd
import sklearn

PATH_SENTENCE_TEXT = r"../dataset/concat_sentence_text.csv"
PATH_SEGMENT_TEXT = r"../dataset/concat_segment_text.csv"

In [2]:
df_segments = pd.read_csv(PATH_SEGMENT_TEXT)
df_segments.head()

Unnamed: 0,segment_text,practice,modality
0,PRIVACY POLICY This privacy policy (hereafter ...,,
1,1. ABOUT OUR PRODUCTS 1.1 Our products offer a...,,
2,2. THE INFORMATION WE COLLECT The information ...,Identifier_Cookie_or_similar_Tech_1stParty,PERFORMED
3,2. THE INFORMATION WE COLLECT The information ...,Identifier_IP_Address_1stParty,PERFORMED
4,"2.2 In addition, we store certain information ...",Identifier_Cookie_or_similar_Tech_1stParty,PERFORMED


#### Cut df down to top 5 practices - probably less noise because not all the counts per practice are equal.

In [3]:
df_segments["practice"].value_counts()[:5]

Contact_E_Mail_Address_1stParty               1105
Identifier_Cookie_or_similar_Tech_1stParty     858
Location_1stParty                              821
Identifier_IP_Address_1stParty                 590
Contact_Phone_Number_1stParty                  565
Name: practice, dtype: int64

In [4]:
top_5_cats = ["Contact_E_Mail_Address_1stParty", "Identifier_Cookie_or_similar_Tech_1stParty", "Location_1stParty", "Identifier_IP_Address_1stParty", "Contact_Phone_Number_1stParty"]

In [5]:
df_segments = df_segments[df_segments["practice"].isin(top_5_cats)]

In [6]:
print(sum(df_segments["practice"].value_counts()[:5]))
print(len(df_segments))

3939
3939


### There are NaNs in the data
Because some segments do not have an associated privacy practice. 
Tried fill na with "none", but performance was not that different.

In [3]:
# df_segments = df_segments.dropna()
# df_segments.head()

Unnamed: 0,segment_text,practice,modality
2,2. THE INFORMATION WE COLLECT The information ...,Identifier_Cookie_or_similar_Tech_1stParty,PERFORMED
3,2. THE INFORMATION WE COLLECT The information ...,Identifier_IP_Address_1stParty,PERFORMED
4,"2.2 In addition, we store certain information ...",Identifier_Cookie_or_similar_Tech_1stParty,PERFORMED
8,2.3 6677g may also use ad network providers to...,Identifier_Cookie_or_similar_Tech_3rdParty,PERFORMED
10,2.5 6677g may share demographic information (c...,Demographic_3rdParty,PERFORMED


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words={'english'}, ngram_range=(1,4), strip_accents='ascii', binary = True)
tfidf_vectors = vectorizer.fit_transform(df_segments["segment_text"])

In [9]:
x_train, x_test, y_train, y_test = train_test_split(tfidf_vectors, df_segments["practice"], test_size = 0.2, random_state = SEED)

## Testing with logistic regression

In [10]:
from sklearn.linear_model import LogisticRegression

In [11]:
logistic_clf = LogisticRegression(random_state = SEED, max_iter = 500, n_jobs = -1, multi_class = "ovr").fit(x_train, y_train)
y_pred = logistic_clf.predict(x_test)

In [12]:
print(classification_report(y_test, y_pred))

                                            precision    recall  f1-score   support

           Contact_E_Mail_Address_1stParty       0.44      0.73      0.55       199
             Contact_Phone_Number_1stParty       0.30      0.09      0.14       118
Identifier_Cookie_or_similar_Tech_1stParty       0.72      0.78      0.75       169
            Identifier_IP_Address_1stParty       0.54      0.32      0.40       131
                         Location_1stParty       0.56      0.52      0.54       171

                                  accuracy                           0.53       788
                                 macro avg       0.51      0.49      0.48       788
                              weighted avg       0.52      0.53      0.50       788



## Testing with SGDClassifier

In [13]:
from sklearn.linear_model import SGDClassifier

In [14]:
clf_sgdclassifier = SGDClassifier(loss = "hinge", max_iter = 5000, random_state=SEED, n_jobs = -1).fit(x_train, y_train)
y_pred = clf_sgdclassifier.predict(x_test)

In [15]:
print(classification_report(y_test, y_pred))

                                            precision    recall  f1-score   support

           Contact_E_Mail_Address_1stParty       0.44      0.58      0.50       199
             Contact_Phone_Number_1stParty       0.25      0.19      0.21       118
Identifier_Cookie_or_similar_Tech_1stParty       0.73      0.79      0.76       169
            Identifier_IP_Address_1stParty       0.48      0.32      0.38       131
                         Location_1stParty       0.57      0.57      0.57       171

                                  accuracy                           0.52       788
                                 macro avg       0.50      0.49      0.49       788
                              weighted avg       0.51      0.52      0.51       788

