#### SVM with Leave One Document Out Cross Validation

In [1]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('omw-1.4', quiet=True)
import pandas as pd
import numpy as np
import re
import string
import os
from nltk.stem import WordNetLemmatizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import GridSearchCV, cross_val_score
import nltk
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings("ignore")

In [2]:
files = os.listdir("./data/Sentences/")
prefix_sentence = "./data/Sentences/"
prefix_label = "./data/Labels/"

clauses = []
for file in files:
    sentence_file_path = prefix_sentence + file 
    label_file_path = prefix_label + file
    sentences_df = pd.read_csv(sentence_file_path, sep="dummy_separator", header=None)
    sentences_df.columns = ["sentences"]
    label_df = pd.read_csv(label_file_path, sep=" ", header=None)
    label_df.columns = ["label"]
    label_df["label_converted"] = np.where(label_df["label"] == -1, 0, 1)
    sentences_df["document"] = file
    df_concat = pd.concat([label_df["label_converted"], sentences_df], axis=1)
    clauses.append(df_concat)

In [3]:
colnames = ["sentences", "label_converted", "document"]
clauses_df = pd.DataFrame(columns = colnames)
for df in clauses:
    clauses_df = clauses_df.append(df)

In [4]:
clauses_df.rename(columns={'label_converted': 'label', 'sentences': 'sentences', 'document' : 'document'}, inplace=True)

In [5]:
clauses_df

Unnamed: 0,sentences,label,document
0,thanks for sending us good vibes by using the ...,0,Viber.txt
1,"you may be surprised , but we will refer to al...",0,Viber.txt
2,"the terms of use -lrb- or , the `` terms '' -r...",0,Viber.txt
3,the language of the terms will seem legal -lrb...,0,Viber.txt
4,"when you use our services , in addition to enj...",1,Viber.txt
...,...,...,...
142,the failure of onavo to enforce any right or p...,0,Onavo.txt
143,the section headings in the agreement are incl...,0,Onavo.txt
144,"`` including '' , whether capitalized or not ,...",0,Onavo.txt
145,this agreement may not be assigned by you with...,0,Onavo.txt


In [6]:
assert (clauses_df.isnull().sum().all() == 0)

In [7]:
# perform lemmatization
# not needed 

wnl = WordNetLemmatizer()
clauses_df['sentences'] = clauses_df['sentences']\
.apply(lambda x: " ".join([wnl.lemmatize(w) 
                           for w in nltk.word_tokenize(x)]))

In [8]:
clauses_df.document.unique()

array(['Viber.txt', 'Nintendo.txt', 'Tinder.txt', 'Dropbox.txt',
       'Microsoft.txt', 'Betterpoints_UK.txt', 'Airbnb.txt',
       'musically.txt', 'Crowdtangle.txt', 'TripAdvisor.txt',
       'Deliveroo.txt', 'Moves-app.txt', 'Spotify.txt', 'Supercell.txt',
       '9gag.txt', 'Booking.txt', 'Headspace.txt', 'Fitbit.txt',
       'Syncme.txt', 'Vimeo.txt', 'Oculus.txt', 'Endomondo.txt',
       'Instagram.txt', 'LindenLab.txt', 'WorldOfWarcraft.txt',
       'YouTube.txt', 'Academia.txt', 'Yahoo.txt', 'WhatsApp.txt',
       'Google.txt', 'Zynga.txt', 'Facebook.txt', 'Amazon.txt',
       'Vivino.txt', 'Netflix.txt', 'PokemonGo.txt', 'Skype.txt',
       'Snap.txt', 'eBay.txt', 'Masquerade.txt', 'Twitter.txt',
       'LinkedIn.txt', 'Skyscanner.txt', 'Duolingo.txt', 'TrueCaller.txt',
       'Uber.txt', 'Rovio.txt', 'Atlas.txt', 'Evernote.txt', 'Onavo.txt'],
      dtype=object)

In [9]:
logo = LeaveOneGroupOut()
X = clauses_df['sentences']
y = clauses_df['label']
group = clauses_df['document']
logo.get_n_splits(X, y, group)

50

In [10]:
train_val_test = []
for train_val_index, test_index in logo.split(X, y, group):
    train_val, test = clauses_df.iloc[train_val_index], clauses_df.iloc[test_index]
    train_val_test.append((train_val, test))

In [11]:
# TF - IDF extraction
scores = []
for batch in train_val_test:
    X_train = batch[0]["sentences"]
    y_train = batch[0]["label"]
    train_groups = batch[0]["document"]
    X_test = batch[1]["sentences"]
    y_test = batch[1]["label"]
    
    test_document = batch[1].document.unique()[0]
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    y_train = y_train.astype('int')
    y_test = y_test.astype('int')
    
    svm = LinearSVC(random_state=0, max_iter = 5000)
    Cs = [0.001, 0.01, 0.1, 1, 10]
    clf = GridSearchCV(estimator=svm, param_grid=dict(C=Cs),n_jobs=-1, scoring = 'f1_macro', refit = True)
    clf.fit(X_train, y_train, groups = train_groups)
    score = clf.score(X_test, y_test)
    scores.append(score)

In [12]:
average_test_f1_score = sum(scores)/len(scores)
print("Average F1 score from Leave one out document : " + str(average_test_f1_score))

Average F1 score from Leave one out document : 0.8438240413629698


### Contractions and stop words

In [13]:
clauses_copy = clauses_df.copy()
# contractions 
def handle_contractions(sentence):
    sentence = re.sub(r"won\'t", "will not", sentence)
    sentence = re.sub(r"can\'t", "can not", sentence)
    sentence = re.sub(r"n\'t", " not", sentence)
    sentence = re.sub(r"\'re", " are", sentence)
    sentence = re.sub(r"\'s", " is", sentence)
    sentence = re.sub(r"\'d", " would", sentence)
    sentence = re.sub(r"\'ll", " will", sentence)
    sentence = re.sub(r"\'t", " not", sentence)
    sentence = re.sub(r"\'ve", " have", sentence)
    sentence = re.sub(r"\'m", " am", sentence)
    return sentence

# stopwords
def remove_stopwords(sentence):
    filtered_words = [word for word in sentence.split() if word not in stopwords.words('english')]
    return " ".join(filtered_words)

processed_clauses = clauses_copy.copy()
processed_clauses['sentences'] = clauses_df['sentences'].apply(handle_contractions)
#processed_clauses['sentences'] = clauses_df['sentences'].apply(remove_stopwords)

In [14]:
logo = LeaveOneGroupOut()
X = processed_clauses['sentences']
y = processed_clauses['label']
group = processed_clauses['document']
logo.get_n_splits(X, y, group)

50

In [15]:
train_val_test = []
for train_val_index, test_index in logo.split(X, y, group):
    train_val, test = processed_clauses.iloc[train_val_index], processed_clauses.iloc[test_index]
    train_val_test.append((train_val, test))

In [16]:
# TF - IDF extraction
scores = []
for batch in train_val_test:
    X_train = batch[0]["sentences"]
    y_train = batch[0]["label"]
    train_groups = batch[0]["document"]
    X_test = batch[1]["sentences"]
    y_test = batch[1]["label"]
    
    test_document = batch[1].document.unique()[0]
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    y_train = y_train.astype('int')
    y_test = y_test.astype('int')
    
    svm = LinearSVC(random_state=0, max_iter = 5000)
    Cs = [0.001, 0.01, 0.1, 1, 10]
    clf = GridSearchCV(estimator=svm, param_grid=dict(C=Cs),n_jobs=-1, scoring = 'f1_macro', refit = True)
    clf.fit(X_train, y_train, groups = train_groups)
    score = clf.score(X_test, y_test)
    scores.append(score)

In [17]:
average_test_f1_score = sum(scores)/len(scores)
print("Average F1 score from Leave one out document : " + str(average_test_f1_score))

Average F1 score from Leave one out document : 0.8415685757311704


#### Using stop words reduces F1 Score to 81%. Without stop words and contractions, the score obtained is higher.