#### SVM with Words and Pos Tags Leave One Document Out Cross Validation

In [1]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('omw-1.4', quiet=True)
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
import re
import string
import os
from nltk.stem import WordNetLemmatizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import GridSearchCV, cross_val_score
import nltk
from nltk.corpus import stopwords
import warnings
import contractions
import unicodedata
warnings.filterwarnings("ignore")

##### Use following to merge words in sentences with its respective pos tags

In [2]:
files = os.listdir("./data/Postags/")
prefix_sentences = "./data/Sentences/"
prefix_pos = "./data/Postags/"
prefix_label = "./data/Labels/"


def merge(sentence1, sentence2):
    sentence = ""
    s1splits = sentence1.split()
    s2splits = sentence2.split()
    for w1,w2 in zip(s1splits,s2splits):
        sentence = sentence + w1 + "/" + w2 + " "
    sentence.strip()
    return sentence
    
word_pos = []
for file in files:
    label_file_path = prefix_label + file
    sentences_file_path = prefix_sentences + file
    postag_file_path = prefix_pos + file
    pos_df = pd.read_csv(postag_file_path, sep="dummy_separator", header=None)
    pos_df.columns = ["postag"]
    
    sentences_df = pd.read_csv(sentences_file_path, sep="dummy_separator", header=None)
    sentences_df.columns = ["sentence"]
     
    sentences_df["postag"] = pos_df["postag"]
    sentences_df["merged"] = "dummy text"
    for index, row in sentences_df.iterrows():
        row["merged"] = merge(row['sentence'], row['postag'])
        
    label_df = pd.read_csv(label_file_path, sep=" ", header=None)
    label_df.columns = ["label"]
    label_df["label_converted"] = np.where(label_df["label"] == -1, 0, 1)
    sentences_df["document"] = file
    df_concat = pd.concat([label_df["label_converted"], sentences_df[["merged", "document"]]], axis=1)
    word_pos.append(df_concat)

In [3]:
colnames = ["label_converted", "merged", "document"]
clauses_df = pd.DataFrame(columns = colnames)
for df in word_pos:
    clauses_df = clauses_df.append(df)

In [4]:
clauses_df.rename(columns={'label_converted': 'label', 'merged': 'sentences', 'document' : 'document'}, inplace=True)
clauses_df.to_csv("data/word_pos_merged.csv", index = False)

In [5]:
clauses_df

Unnamed: 0,label,sentences,document
0,0,thanks/NNS for/IN sending/VBG us/PRP good/JJ v...,Viber.txt
1,0,"you/PRP may/MD be/VB surprised/VBN ,/, but/CC ...",Viber.txt
2,0,the/DT terms/NNS of/IN use/NN -lrb-/-LRB- or/C...,Viber.txt
3,0,the/DT language/NN of/IN the/DT terms/NNS will...,Viber.txt
4,1,when/WRB you/PRP use/VBP our/PRP$ services/NNP...,Viber.txt
...,...,...,...
142,0,the/DT failure/NN of/IN onavo/NNP to/TO enforc...,Onavo.txt
143,0,the/DT section/NN headings/NNS in/IN the/DT ag...,Onavo.txt
144,0,"``/`` including/VBG ''/'' ,/, whether/IN capit...",Onavo.txt
145,0,this/DT agreement/NN may/MD not/RB be/VB assig...,Onavo.txt


In [6]:
assert (clauses_df.isnull().sum().all() == 0)

In [7]:
def to_lower(data: pd.Series):
    return data.str.lower()

def remove_accented_characters(data: pd.Series):
    return data.apply(lambda x: unicodedata.normalize("NFKD", x).encode("ascii", "ignore").decode("utf-8", "ignore"))

def remove_html_encodings(data: pd.Series):
    return data.str.replace(r"\d+;", " ", regex=True)

def remove_html_tags(data: pd.Series):
    return data.str.replace(r"<[a-zA-Z]+\s?/?>", " ", regex=True)

def remove_url(data: pd.Series):
    return data.str.replace(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", " ", regex=True)

def remove_html_and_url(data: pd.Series):
    data.str.replace(r"\d+;", " ", regex=True)
    data.str.replace(r"<[a-zA-Z]+\s?/?>", " ", regex=True)
    data.str.replace(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", " ", regex=True)
    return data

def remove_extra_spaces(data: pd.Series):
    return data.str.replace(r"^\s*|\s\s*", " ", regex=True)
                     
def remove_non_alpha_characters(data: pd.Series):
    return data.str.replace(r"_+|\\|[^a-zA-Z0-9\s]", " ", regex=True)

def fix_contractions(data: pd.Series):
    def contraction_fixer(txt: str):
        return " ".join([contractions.fix(word) for word in txt.split()])
    return data.apply(contraction_fixer)

def remove_special_words(data: pd.Series):
    return data.str.replace(r"\-[^a-zA-Z]{3}\-", " ", regex=True)
                     
data_cleaning_pipeline = {
    "sentences": [
        to_lower,
        remove_special_words,
        remove_accented_characters,
        remove_html_encodings,
        remove_html_tags,
        remove_url,
        fix_contractions,
        remove_non_alpha_characters,
        remove_extra_spaces,
    ]
}

cleaned_data = clauses_df.copy()

# for col, pipeline in data_cleaning_pipeline.items():
#     temp_data = cleaned_data[col].copy()
#     for func in pipeline:
#         print(f"Starting: {func.__name__}")
#         temp_data = func(temp_data)
#         print(f"Ended: {func.__name__}")
#     cleaned_data[col] = temp_data.copy()

In [8]:
cleaned_data.document.unique()

array(['Viber.txt', 'Nintendo.txt', 'Tinder.txt', 'Dropbox.txt',
       'Microsoft.txt', 'Betterpoints_UK.txt', 'Airbnb.txt',
       'musically.txt', 'Crowdtangle.txt', 'TripAdvisor.txt',
       'Deliveroo.txt', 'Moves-app.txt', 'Spotify.txt', 'Supercell.txt',
       '9gag.txt', 'Booking.txt', 'Headspace.txt', 'Fitbit.txt',
       'Syncme.txt', 'Vimeo.txt', 'Oculus.txt', 'Endomondo.txt',
       'Instagram.txt', 'LindenLab.txt', 'WorldOfWarcraft.txt',
       'YouTube.txt', 'Academia.txt', 'Yahoo.txt', 'WhatsApp.txt',
       'Google.txt', 'Zynga.txt', 'Facebook.txt', 'Amazon.txt',
       'Vivino.txt', 'Netflix.txt', 'PokemonGo.txt', 'Skype.txt',
       'Snap.txt', 'eBay.txt', 'Masquerade.txt', 'Twitter.txt',
       'LinkedIn.txt', 'Skyscanner.txt', 'Duolingo.txt', 'TrueCaller.txt',
       'Uber.txt', 'Rovio.txt', 'Atlas.txt', 'Evernote.txt', 'Onavo.txt'],
      dtype=object)

In [9]:
logo = LeaveOneGroupOut()
X = cleaned_data['sentences']
y = cleaned_data['label']
group = cleaned_data['document']
logo.get_n_splits(X, y, group)

50

In [10]:
train_val_test = []
for train_val_index, test_index in logo.split(X, y, group):
    train_val, test = cleaned_data.iloc[train_val_index], cleaned_data.iloc[test_index]
    train_val_test.append((train_val, test))

In [11]:
ngram_ranges = [(1,1), (1,2), (2,2), (1,3), (2,3), (3,3)]
scores_compare = {}
for nrange in ngram_ranges:
    scores = []
    for batch in train_val_test:
        X_train = batch[0]["sentences"]
        y_train = batch[0]["label"]
        train_groups = batch[0]["document"]
        X_test = batch[1]["sentences"]
        y_test = batch[1]["label"]
    
        test_document = batch[1].document.unique()[0]
        vectorizer = TfidfVectorizer(lowercase = True, ngram_range = nrange)
        X_train = vectorizer.fit_transform(X_train)
        X_test = vectorizer.transform(X_test)
        y_train = y_train.astype('int')
        y_test = y_test.astype('int')
    
        svm = LinearSVC(random_state=0, max_iter = 5000)
        Cs = [0.001, 0.01, 0.1, 1, 10]
        clf = GridSearchCV(estimator=svm, param_grid=dict(C=Cs),n_jobs=-1, scoring = 'f1', refit = True)
        clf.fit(X_train, y_train, groups = train_groups)
        score = clf.score(X_test, y_test)
        scores.append(score)
    average_test_f1_score = sum(scores)/len(scores)
    scores_compare[nrange] = average_test_f1_score

scores_compare

{(1, 1): 0.7277559040443506,
 (1, 2): 0.7628525372712782,
 (2, 2): 0.7507132632645692,
 (1, 3): 0.7807430838056204,
 (2, 3): 0.7672602063181105,
 (3, 3): 0.7532668958661458}

In [12]:
print("Average F1 score from Leave one out document : " + str(max(scores_compare.values())))

Average F1 score from Leave one out document : 0.7807430838056204


In [13]:
print("N grams that gave maximum F1 score: " + str(max(scores_compare, key=scores_compare.get)))

N grams that gave maximum F1 score: (1, 3)
