In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd

# Read in Data

In [2]:

# Label metadata; joined onto both splits below via the shared 'label' column
labels_data = pd.read_csv('data/labels.csv')

# Training split + label metadata
training_data = pd.read_csv('data/training.csv').merge(labels_data, on='label')

# Validation split + label metadata
validation_data = pd.read_csv('data/validations.csv').merge(labels_data, on='label')

In [3]:
# Display the merged training DataFrame as this cell's output
training_data

Unnamed: 0,content,label,Type of Clause,Degree of Unfairness
0,"The purpose of this website, 9gag.com (the “Si...",unc,Unknown,0
1,"You agree that neither 9GAG, Inc nor the Site ...",ltd2,Limitation of Liability,2
2,"9GAG, Inc retains the right to create limits o...",ter3,Unilateral Termination,3
3,The Site is protected by copyright as a collec...,unc,Unknown,0
4,"Subscriber may download or copy the Content, a...",unc,Unknown,0
...,...,...,...,...
2988,If you are a player outside of the United Stat...,unc,Unknown,0
2989,Any attempted notice that does not follow thes...,unc,Unknown,0
2990,You agree that given the unique and irreplacea...,unc,Unknown,0
2991,You agree to limit your claims to claims for m...,unc,Unknown,0


## Split the data into X and y

In [4]:
# Inputs come from the 'content' column, targets from the 'label' column
X_train, y_train = training_data['content'], training_data['label']
X_val, y_val = validation_data['content'], validation_data['label']


# Establish the baseline model

In [5]:

# TF-IDF features: the vectorizer is fitted on the training text only and
# then reused, as-is, to transform the validation text
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

# Baseline classifier: logistic regression with default settings
# (fit() returns the estimator itself, so it can be chained)
model = LogisticRegression().fit(X_train_vec, y_train)

# Evaluate on the validation split
y_pred = model.predict(X_val_vec)
accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.8546739984289081


# Improve the model using Hyperparameter Tuning

TODO: write explanation

In [6]:
import re

import nltk
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Make sure the NLTK corpora used by the preprocessor are available locally
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Custom text preprocessor
class TextPreprocessor(TransformerMixin, BaseEstimator):
    """Stateless scikit-learn transformer that normalises text documents.

    Every document is lower-cased, each non-word character is replaced by a
    space, English stopwords are removed, and the remaining tokens are
    lemmatized with WordNet and re-joined with single spaces.

    The class derives from ``BaseEstimator`` (listed after the mixin) so that
    it exposes ``get_params``/``set_params`` like any other estimator and can
    be used as a regular step inside ``Pipeline`` / ``GridSearchCV``.
    """

    def __init__(self):
        # No hyper-parameters; NLTK resources are created lazily in transform()
        pass

    def transform(self, X, **transform_params):
        """Clean every document in ``X``.

        Parameters
        ----------
        X : iterable of str
            Raw documents. Each item must support ``.lower()``.
        **transform_params
            Accepted for API compatibility; unused.

        Returns
        -------
        list of str
            One cleaned, space-joined string per input document, in order.
        """
        # Created here rather than in __init__, so the instance itself never
        # holds NLTK objects as attributes.
        from nltk.corpus import stopwords
        lemmatizer = WordNetLemmatizer()
        english_stopwords = set(stopwords.words('english'))

        cleaned_docs = []
        for doc in X:
            # Replace non-word characters with spaces; str.split() then
            # collapses any run of whitespace, so no extra regex pass is needed.
            tokens = re.sub(r'\W', ' ', doc.lower()).split()
            cleaned_docs.append(' '.join(
                lemmatizer.lemmatize(token)
                for token in tokens
                if token not in english_stopwords
            ))
        return cleaned_docs

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

# Text clean-up -> TF-IDF features -> SVM classifier, chained as one estimator
pipeline = Pipeline(steps=[
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer()),
    ('classifier', SVC()),
])

# Search space: 2 vocabulary sizes x 3 values of C x 2 kernels = 12 candidates
param_grid = dict(
    vectorizer__max_features=[5000, 10000],
    classifier__C=[0.1, 1, 10],
    classifier__kernel=['linear', 'rbf'],
)

# 5-fold cross-validated grid search; n_jobs=-1 requests parallel execution
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Keep the best-scoring pipeline found by the search
best_model = grid_search.best_estimator_

# Evaluate it on the validation split
y_pred = best_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Best Model Accuracy:", accuracy)


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/priyankamarwaha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/priyankamarwaha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Best Model Accuracy: 0.8986645718774549


### Deep Learning based baseline

##### TODO: Write explanation

In [7]:
import torch
from transformers import BertTokenizer, AdamW
from sklearn.metrics import accuracy_score
from baseline_dl_model import GdprClassifier, create_data_loader, ExtendedLabelEncoder


# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer for the pretrained uncased BERT base checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Fit the label encoder on the training labels, then apply it to validation
extended_le = ExtendedLabelEncoder()
y_train_enc = extended_le.fit_transform(y_train)
y_val_enc = extended_le.transform(y_val)

# Number of output classes = number of classes known to the encoder
# NOTE(review): presumably this already includes an extra 'unknown' class
# added by ExtendedLabelEncoder — confirm against baseline_dl_model.
NUM_CLASSES = len(extended_le.classes_)

# Data loaders for both splits (same sequence length and batch size)
BATCH_SIZE = 16
MAX_LEN = 256
train_data_loader = create_data_loader(X_train, y_train_enc, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(X_val, y_val_enc, tokenizer, MAX_LEN, BATCH_SIZE)

# Build the classifier and move it to the selected device
model = GdprClassifier(n_classes=NUM_CLASSES).to(device)

# Training configuration
# NOTE(review): AdamW here is the transformers implementation, which is
# deprecated upstream in favour of torch.optim.AdamW — verify before upgrading.
EPOCHS = 3
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
loss_fn = torch.nn.CrossEntropyLoss().to(device)



In [8]:
from tqdm import tqdm
# Train for EPOCHS full passes over the training loader.
# The two debugging `break` statements were removed: they stopped training
# after a single batch and left the progress postfix, the average-loss
# computation and the per-epoch log unreachable.
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_data_loader, desc=f'Epoch {epoch + 1}/{EPOCHS}')
    for batch in progress_bar:
        # Move the batch tensors to the same device as the model
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Standard optimisation step: reset grads, forward, loss, backward, update
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        batch_loss = loss.item()
        total_loss += batch_loss
        loss.backward()
        optimizer.step()

        # Show the current batch loss next to the progress bar
        progress_bar.set_postfix({'Training Loss': batch_loss})

    # Mean loss per batch over the epoch
    avg_train_loss = total_loss / len(train_data_loader)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Training Loss: {avg_train_loss}')



In [None]:
# Evaluation helper from the project module, called with (model, loader, device).
# Imported absolutely, like the other `baseline_dl_model` imports in this
# notebook: a relative import (`from .baseline_dl_model ...`) fails in a
# notebook cell because there is no parent package.
# NOTE: this rebinds `accuracy_score`, shadowing the sklearn.metrics function
# imported earlier; re-run the sklearn import before re-running the cells that
# call `accuracy_score(y_val, y_pred)`.
from baseline_dl_model import accuracy_score

accuracy_score(model, val_data_loader, device)