In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

PATH_SENTENCE_TEXT = r"../dataset/concat_sentence_text.csv"
PATH_SEGMENT_TEXT = r"../dataset/concat_segment_text.csv"

In [2]:
# Data restructuring
df = pd.read_csv(PATH_SEGMENT_TEXT)
df_value_counts = df["practice"].value_counts().rename_axis("practice").reset_index(name = "counts")
df = df.fillna("None")

top_5_cats = list(df_value_counts["practice"][:5])
df = df[df["practice"].isin(top_5_cats)]

df.head()

Unnamed: 0,segment_text,practice,modality
2,2. THE INFORMATION WE COLLECT The information ...,Identifier_Cookie_or_similar_Tech_1stParty,PERFORMED
3,2. THE INFORMATION WE COLLECT The information ...,Identifier_IP_Address_1stParty,PERFORMED
4,"2.2 In addition, we store certain information ...",Identifier_Cookie_or_similar_Tech_1stParty,PERFORMED
12,2.6 Most browsers are initially set to accept ...,Identifier_Cookie_or_similar_Tech_1stParty,PERFORMED
13,"B) PERSONAL DATA, INCLUDING YOUR E-MAIL ADDRES...",Contact_E_Mail_Address_1stParty,PERFORMED


### Tuning logistic with tfidf

In [3]:
vectorizer = TfidfVectorizer(stop_words={'english'}, strip_accents='ascii', ngram_range = (1, 4))
logistic = LogisticRegression(random_state = 1, max_iter = 500, n_jobs = -1, multi_class = "ovr")

In [4]:
x_tf = vectorizer.fit_transform(df["segment_text"])
x_train, x_test, y_train, y_test = train_test_split(x_tf, df["practice"], test_size=0.2, random_state=0)

In [5]:
param_grid = {
    "solver" : ["liblinear", "lbfgs", "sag"],
    "penalty" : ["l1", "l2", "elasticnet"],
    "C" : [0.001, 0.01, 0.1, 1]
}

In [6]:
grid_search = GridSearchCV(logistic, param_grid, scoring = "roc_auc", cv = 3, verbose = 1, n_jobs=-1)

In [7]:
grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


60 fits failed out of a total of 108.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Tristan\Desktop\Capstone ML and Data privacy\capstone-repo\capstone\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Tristan\Desktop\Capstone ML and Data privacy\capstone-repo\capstone\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Tristan\Desktop\Capstone ML and Data privacy\capstone-repo\capstone\lib\site-packages\sklearn\linear_model\_logistic.py", li

GridSearchCV(cv=3,
             estimator=LogisticRegression(max_iter=500, multi_class='ovr',
                                          n_jobs=-1, random_state=1),
             n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 1],
                         'penalty': ['l1', 'l2', 'elasticnet'],
                         'solver': ['liblinear', 'lbfgs', 'sag']},
             scoring='roc_auc', verbose=1)

In [8]:
grid_search.best_estimator_

LogisticRegression(C=0.001, max_iter=500, multi_class='ovr', n_jobs=-1,
                   penalty='l1', random_state=1, solver='liblinear')

In [13]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,param_solver,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,1.104596,0.078076,0.006999,0.000817,0.001,l1,liblinear,"{'C': 0.001, 'penalty': 'l1', 'solver': 'libli...",,,,,,1
1,0.030547,0.002316,0.0,0.0,0.001,l1,lbfgs,"{'C': 0.001, 'penalty': 'l1', 'solver': 'lbfgs'}",,,,,,20
2,0.037703,0.008991,0.0,0.0,0.001,l1,sag,"{'C': 0.001, 'penalty': 'l1', 'solver': 'sag'}",,,,,,21
3,7.937775,3.788354,0.019649,0.018618,0.001,l2,liblinear,"{'C': 0.001, 'penalty': 'l2', 'solver': 'libli...",,,,,,22
4,42.653777,0.700033,0.007001,0.001641,0.001,l2,lbfgs,"{'C': 0.001, 'penalty': 'l2', 'solver': 'lbfgs'}",,,,,,23
5,33.284264,1.089195,0.034998,0.038263,0.001,l2,sag,"{'C': 0.001, 'penalty': 'l2', 'solver': 'sag'}",,,,,,24
6,0.058994,0.031495,0.0,0.0,0.001,elasticnet,liblinear,"{'C': 0.001, 'penalty': 'elasticnet', 'solver'...",,,,,,25
7,0.027327,0.008265,0.0,0.0,0.001,elasticnet,lbfgs,"{'C': 0.001, 'penalty': 'elasticnet', 'solver'...",,,,,,26
8,0.033663,0.018659,0.0,0.0,0.001,elasticnet,sag,"{'C': 0.001, 'penalty': 'elasticnet', 'solver'...",,,,,,27
9,14.144637,4.206433,0.051004,0.063644,0.01,l1,liblinear,"{'C': 0.01, 'penalty': 'l1', 'solver': 'liblin...",,,,,,28


In [14]:
logistic_default = LogisticRegression(random_state = 1, max_iter = 500, n_jobs = -1, multi_class = "ovr")

In [16]:
logistic_default = logistic_default.fit(x_train, y_train)
y_pred = logistic_default.predict(x_test)

In [17]:
print(classification_report(y_test, y_pred))

                                            precision    recall  f1-score   support

           Contact_E_Mail_Address_1stParty       0.51      0.74      0.60       224
             Contact_Phone_Number_1stParty       0.24      0.11      0.15       121
Identifier_Cookie_or_similar_Tech_1stParty       0.78      0.85      0.81       171
            Identifier_IP_Address_1stParty       0.45      0.36      0.40       108
                         Location_1stParty       0.63      0.53      0.57       164

                                  accuracy                           0.57       788
                                 macro avg       0.52      0.52      0.51       788
                              weighted avg       0.54      0.57      0.55       788



### Tuning SVC with Tfidf

In [3]:
svc = SVC(probability=True)

In [5]:
param_grid = {
    "kernel" : ["linear", "poly", "rbf", "sigmoid"],
    "gamma" : [0.1, 1, 10, 100],
    "C" : [0.1, 1, 10, 100],
    "degree" : range(4, 50)
}

In [7]:
grid_search = RandomizedSearchCV(svc, param_grid, scoring = "roc_auc", cv = 3, verbose = 3, random_state=1, n_jobs=-1)

In [8]:
grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
