In [1]:
import os
import pandas as pd

In [2]:
class DatasetPandas:
    def __init__(self, path):
        with open(path, 'r') as dataset:
            self.dataset = pd.read_csv(dataset)
        _, filename = os.path.split(path)
        print("Successfully loaded", filename)

    def get_dataset(self):
        return self.dataset
        
    def get_splits(self):
        train = self.dataset[self.dataset.split == 'train']
        test = self.dataset[self.dataset.split == 'test']
        dev = self.dataset[self.dataset.split == 'dev']
        print('Returned...')
        print(f'Train Length: {len(train)}\nTest Length: {len(test)}\nDev Length: {len(dev)}')
        return train, test, dev

In [3]:
import re
import nltk
from nltk.corpus import stopwords
import string
cachedStopWords = stopwords.words("english")


def remove_dates(text):
    pattern = '(\b(0?[1-9]|[12]\d|30|31)[^\w\d\r\n:](0?[1-9]|1[0-2])[^\w\d\r\n:](\d{4}|\d{2})\b)|(\b(0?[1-9]|1[0-2])[^\w\d\r\n:](0?[1-9]|[12]\d|30|31)[^\w\d\r\n:](\d{4}|\d{2})\b)'
    pattern2 = '(0?[1-9]|[12]\d|30|31)[^\w\d\r\n:](0?[1-9]|1[0-2])[^\w\d\r\n:](\d{4}|\d{2})'

    text = re.sub(pattern, 'DATE', text)
    text = re.sub(pattern2, 'DATE', text)
    return text


def remove_punctuation(text):
    return ''.join([word for word in text if word not in string.punctuation])

def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in cachedStopWords])

def tokenize(text):
    return nltk.word_tokenize(text)

def no_preprocess(text):
    text = text.lower()
    return tokenize(text)

def no_stopwords(text):
    text = text.lower()
    text = remove_stopwords(text)
    return tokenize(text)

def no_sw_punc(text):
    text = text.lower()
    text = remove_stopwords(text)
    text = remove_punctuation(text)
    return tokenize(text)

def preprocess_all(text):
    text = text.lower()
    text = remove_dates(text)
    text = remove_stopwords(text)
    text = remove_punctuation(text)
    return tokenize(text)

In [4]:
def get_dataset_stat(tokenized_text):
    total_tokenized_words = 0
    for i in range(len(tokenized_text)):
        total_tokenized_words += len(tokenized_text[i])
    avg_tokens = total_tokenized_words/len(tokenized_text)
    print('total tokenized words: ', total_tokenized_words)
    print('avg tokens: ', avg_tokens)
    return total_tokenized_words, avg_tokens

In [5]:
single_path = 'ILDC_single/ILDC_single.csv'
single_dataset_loader = DatasetPandas(single_path)
train, test, dev = single_dataset_loader.get_splits()

Successfully loaded ILDC_single.csv
Returned...
Train Length: 5082
Test Length: 1517
Dev Length: 994


In [6]:
single = single_dataset_loader.get_dataset()
single

Unnamed: 0,text,label,split,name
0,"F. NARIMAN, J. Leave granted. In 2008, the Pu...",1,train,2019_890.txt
1,"S. THAKUR, J. Leave granted. These appeals ar...",0,train,2014_170.txt
2,"Markandey Katju, J. Leave granted. Heard lear...",1,train,2010_721.txt
3,"ALTAMAS KABIR,J. Leave granted. The question ...",1,train,2008_1460.txt
4,"CIVIL APPEAL NO. 598 OF 2007 K. MATHUR, J. Th...",1,train,2008_188.txt
...,...,...,...,...
7588,civil appellate jurisdiction civil appeal numb...,1,dev,1985_121.txt
7589,criminal appellate jurisdiction special leave\...,0,dev,1984_228.txt
7590,civil appellate jurisdiction civil appeal numb...,0,dev,1965_317.txt
7591,civil appellate jurisdiction civil appeal numb...,1,dev,1976_191.txt


In [7]:
single['tokenized_text_all'] = single.text.apply(preprocess_all)
single['tokenized_text_no_sw_punc'] = single.text.apply(no_sw_punc)
single['tokenized_text_no_stopwords'] = single.text.apply(no_stopwords)
single['tokenized_text_npp'] = single.text.apply(no_preprocess)

In [8]:
print("All Preprocess")
num_tokens_single_all, avg_tokens_single_all = get_dataset_stat(single.tokenized_text_all)
print("No Stopwords, Punctuation")
num_tokens_single_swpun, avg_tokens_single_swpun = get_dataset_stat(single.tokenized_text_no_sw_punc)
print("No Stopwords")
num_tokens_single_sw, avg_tokens_single_sw = get_dataset_stat(single.tokenized_text_no_stopwords)
print("No preprocess")
num_tokens_single_npp, avg_tokens_single_npp = get_dataset_stat(single.tokenized_text_npp)

All Preprocess
total tokenized words:  15752521
avg tokens:  2074.610957460819
No Stopwords, Punctuation
total tokenized words:  15753733
avg tokens:  2074.7705781640984
No Stopwords
total tokenized words:  17629647
avg tokens:  2321.828921374951
No preprocess
total tokenized words:  31788864
avg tokens:  4186.601343342552


In [9]:
multi_path = 'ILDC_multi/ILDC_multi.csv'
multi_dataset_loader = DatasetPandas(multi_path)
train_m, test_m, dev_m = multi_dataset_loader.get_splits()

Successfully loaded ILDC_multi.csv
Returned...
Train Length: 32305
Test Length: 1517
Dev Length: 994


In [10]:
multi = multi_dataset_loader.get_dataset()
multi

Unnamed: 0,text,label,split,name
0,"Uday Umesh Lalit, J. These appeals arise out ...",0,train,2020_1.txt
1,"Indira Banerjee, J. These appeals are against...",0,train,2020_2.txt
2,TABLE OF CONTENTS Introduction A Contentions B...,0,train,2020_3.txt
3,"Dinesh Maheshwari, J. Introductory with brief...",0,train,2020_4.txt
4,"Dinesh Maheshwari, J. Preliminary By way of t...",0,train,2020_5.txt
...,...,...,...,...
34811,"Dr. B. S. CHAUHAN, J. This appeal has been pr...",1,test,2013_101.txt
34812,"Rajendra Babu, J. The respondents, who are co...",0,test,2000_1559.txt
34813,Leave granted. Respondent 1 Insurance Company...,1,test,1998_83.txt
34814,Leave granted. The respondent is an Executive...,1,test,1995_2.txt


In [11]:
multi['tokenized_text_all'] = multi.text.apply(preprocess_all)
multi['tokenized_text_no_sw_punc'] = multi.text.apply(no_sw_punc)
multi['tokenized_text_no_stopwords'] = multi.text.apply(no_stopwords)
multi['tokenized_text_npp'] = multi.text.apply(no_preprocess)

In [12]:
print("All Preprocess")
num_tokens_multi_all, avg_tokens_multi_all = get_dataset_stat(multi.tokenized_text_all)
print("No Stopwords, Punctuation")
num_tokens_multi_swpun, avg_tokens_multi_swpun = get_dataset_stat(multi.tokenized_text_no_sw_punc)
print("No Stopwords")
num_tokens_multi_sw, avg_tokens_multi_sw = get_dataset_stat(multi.tokenized_text_no_stopwords)
print("No preprocess")
num_tokens_multi_npp, avg_tokens_multi_npp = get_dataset_stat(multi.tokenized_text_npp)

All Preprocess
total tokenized words:  59853190
avg tokens:  1719.128848805147
No Stopwords, Punctuation
total tokenized words:  59857722
avg tokens:  1719.2590188419117
No Stopwords
total tokenized words:  67790830
avg tokens:  1947.117130055147
No preprocess
total tokenized words:  119470780
avg tokens:  3431.4906939338234


In [13]:
def add_paragraphs(data):
    new_data = []
    for text in data:
        dummy = ''
        for i in range(len(text)-1):
            dummy += text[i][:-1]
        new_data.append(dummy)
    return new_data

In [14]:
expert_path = 'ILDC_expert/source/'
print(os.listdir(expert_path))

['2013_35.txt', '1962_384.txt', '2003_794.txt', '1954_144.txt', '1962_47.txt', '1959_189.txt', '1999_1001.txt', '1960_327.txt', '1953_74.txt', '2013_30.txt', '1959_5.txt', '1960_44.txt', '1960_87.txt', '1951_64.txt', '1961_344.txt', '2013_95.txt', '2013_57.txt', '1959_66.txt', '1962_118.txt', '1953_14.txt', '1951_10.txt', '1959_76.txt', '1952_42.txt', '1961_365.txt', '1961_417.txt', '1951_36.txt', '1962_113.txt', '1962_339.txt', '1953_26.txt', '1954_13.txt', '1961_400.txt', '1951_35.txt', '1962_105.txt', '1951_30.txt', '1954_0.txt', '1961_363.txt', '1952_75.txt', '1960_10.txt', '1959_134.txt', '1962_128.txt', '1952_60.txt', '1951_33.txt', '1954_114.txt', '1960_12.txt', '1962_213.txt', '1962_207.txt', '1951_40.txt', '1960_100.txt', '1959_26.txt', '1954_158.txt', '1951_80.txt', '1960_103.txt', '1960_265.txt', '1953_57.txt', '1960_72.txt', '1963_37.txt']


In [15]:
data = []
for file in os.listdir(expert_path):
    with open(expert_path+file, 'r') as f:
        data.append(f.readlines())
print(len(data))

56


In [16]:
new_data = add_paragraphs(data)
len(new_data)

56

In [17]:
data_tokenized_all = []
data_tokenized_no_sw_punc= []
data_tokenized_no_stopwords = []
data_tokenized_npp = []

for i in new_data:
    data_tokenized_all.append(preprocess_all(i))
    data_tokenized_no_sw_punc.append(no_sw_punc(i))
    data_tokenized_no_stopwords.append(no_stopwords(i))
    data_tokenized_npp.append(no_preprocess(i))


In [18]:
print("All Preprocess")
num_tokens_expert_all, avg_tokens_expert_all = get_dataset_stat(data_tokenized_all)
print("No Stopwords, Punctuation")
num_tokens_expert_swpun, avg_tokens_expert_swpun = get_dataset_stat(data_tokenized_no_sw_punc)
print("No Stopwords")
num_tokens_expert_sw, avg_tokens_expert_sw = get_dataset_stat(data_tokenized_no_stopwords)
print("No preprocess")
num_tokens_expert_npp, avg_tokens_expert_npp = get_dataset_stat(data_tokenized_npp)

All Preprocess
total tokenized words:  85829
avg tokens:  1532.6607142857142
No Stopwords, Punctuation
total tokenized words:  85876
avg tokens:  1533.5
No Stopwords
total tokenized words:  101241
avg tokens:  1807.875
No preprocess
total tokenized words:  185296
avg tokens:  3308.8571428571427


In [24]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# from keras.preprocessing.sequence import pad_sequences
# from sklearn.model_selection import train_test_split


from pytorch_transformers import XLNetModel, XLNetTokenizer, XLNetForSequenceClassification
# from pytorch_transformers import AdamW

# from tqdm import tqdm, trange
# import pandas as pd
# import io
# import numpy as np
# import matplotlib.pyplot as plt
# % matplotlib inline


In [25]:
for text in train.text:
    print(text)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [26]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

tokenized_texts = [tokenizer.tokenize(text) for text in train.text]
print ("Tokenize the first sentence:")
print (tokenized_texts[0])


100%|██████████| 798011/798011 [01:00<00:00, 13176.52B/s]


Tokenize the first sentence:
['▁f', '.', '▁', 'na', 'ri', 'man', ',', '▁', 'j', '.', '▁leave', '▁granted', '.', '▁in', '▁2008', ',', '▁the', '▁', 'pun', 'ja', 'b', '▁state', '▁water', '▁supply', '▁sewer', 'age', '▁board', ',', '▁', 'bha', 't', 'inda', '▁issued', '▁number', 'ice', '▁inviting', '▁tender', '▁for', '▁extension', '▁and', '▁augment', 'ation', '▁of', '▁water', '▁supply', ',', '▁sewer', 'age', '▁scheme', ',', '▁pumping', '▁station', '▁and', '▁sewer', 'age', '▁treatment', '▁plant', '▁for', '▁various', '▁towns', '▁mentioned', '▁there', 'in', '▁on', '▁a', '▁turn', 'key', '▁basis', '.', '▁on', '▁25', '.', '9', '.', '2008', ',', '▁the', '▁app', 'ell', 'ant', '▁company', 'pan', 'y', ',', '▁which', '▁is', '▁signature', '▁not', '▁verified', '▁involved', '▁in', '▁civil', '▁electrical', '▁works', '▁in', '▁in', 'dia', ',', '▁was', '▁awarded', '▁the', '▁said', '▁digital', 'ly', '▁signed', '▁by', '▁', 'ni', 'dhi', '▁a', 'hu', 'ja', '▁date', '▁2019', '.', '03', '.', '11', '▁17', '33', '59',

In [None]:
tokenized_texts_single = [tokenizer.tokenize(text) for text in single.text]

In [None]:
total = 0
for text in tokenized_texts_single:
    total += len(text)
print(total/len(single))