# Importing Data

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the dataset
# paths used later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

In [None]:
# Load the combined labelled dataset (one text snippet and one label per row)
# from the mounted Drive folder.
full_dataset = pd.read_csv('/content/drive/My Drive/Loom Analytics NLP 2/dataset/combined_data.csv')

In [None]:
# Preview the first rows to confirm the `text` and `label` columns loaded.
full_dataset.head()

Unnamed: 0,text,label
0,Citation: Springfield Capital Inc. v Grande Pr...,party
1,"Appearances: K.D. Wakefield, Q.C. for the Appl...",council
2,The Honourable Mr. Justice Frans Slatter,judge
3,"Conclusion [22] In conclusion, the applicant h...",outcome
4,In the Court of Appeal of Albert,court


# Data Cleaning

In [None]:
# Raw label distribution: shows the spelling/case variants that need merging.
full_dataset['label'].value_counts()

judge         257
outcome       241
facts         210
council       193
party         165
court         152
counsel        89
casename       44
Judge          26
Outcome        21
Counsel        20
fact           16
Court          10
Casename       10
Facts           6
Council         2
conclusion      1
outcome         1
Name: label, dtype: int64

In [None]:
# Collapse the hand-entered label variants into the six canonical classes
# (counsel, judge, outcome, facts, casename, court).
#
# Surrounding whitespace and letter case are normalised first, so variants such
# as 'Judge', 'Court' or 'Facts' fold into their lower-case form automatically;
# the alias table then only has to merge names that are genuinely different.
LABEL_ALIASES = {
    'council': 'counsel',
    'party': 'casename',
    'conclusion': 'outcome',
    'fact': 'facts',
}

# The result is assigned back to the column rather than using
# `full_dataset['label'].replace(..., inplace=True)`: that form modifies a
# selected column in place (chained assignment), which pandas warns about and
# which does not update `full_dataset` when Copy-on-Write is enabled.
full_dataset['label'] = (
    full_dataset['label']
    .str.strip()
    .str.lower()
    .replace(LABEL_ALIASES)
)

In [None]:
# Label distribution after clean-up: only the canonical class names should remain.
full_dataset['label'].value_counts()

counsel     304
judge       283
outcome     264
facts       232
casename    219
court       162
Name: label, dtype: int64

In [None]:
# (rows, columns) of the full dataset.
full_dataset.shape

(1464, 2)

In [None]:
# Delete newline characters from every text entry. The pattern is applied as a
# regex and the replacement is the empty string, so text on either side of a
# newline is joined without a separating space.
full_dataset['text'] = full_dataset['text'].replace(
    to_replace='\\n',
    value='',
    regex=True,
)

In [None]:
# Preview the cleaned data (labels normalised, newlines removed from text).
full_dataset.head()

Unnamed: 0,text,label
0,Citation: Springfield Capital Inc. v Grande Pr...,casename
1,"Appearances: K.D. Wakefield, Q.C. for the Appl...",counsel
2,The Honourable Mr. Justice Frans Slatter,judge
3,"Conclusion [22] In conclusion, the applicant h...",outcome
4,In the Court of Appeal of Albert,court


# Dividing Dataset into train and test

In [None]:
from numpy.random import RandomState

# Fixed seed so that a kernel restart reproduces exactly the same split; an
# unseeded RandomState() draws a different partition on every run.
SPLIT_SEED = 42
rand = RandomState(SPLIT_SEED)

# 90% of the rows form the train+validation pool; the remaining 10% is the test set.
train_valid = full_dataset.sample(frac=0.9, random_state=rand)

# 77% of the pool is used for training, the rest of the pool for validation.
train = train_valid.sample(frac=0.77, random_state=rand)
valid = train_valid.loc[~train_valid.index.isin(train.index)]

# Test rows are the ones whose index was not drawn into the pool.
test = full_dataset.loc[~full_dataset.index.isin(train_valid.index)]

In [None]:
# Load the previously saved splits from Drive.
# NOTE(review): this rebinds `train`, `valid` and `test`, so any split computed
# earlier in the session is discarded and everything below uses these files.
train = pd.read_csv("/content/drive/My Drive/Loom Analytics NLP 2/dataset/train.csv")
valid = pd.read_csv("/content/drive/My Drive/Loom Analytics NLP 2/dataset/validate.csv")
test = pd.read_csv("/content/drive/My Drive/Loom Analytics NLP 2/dataset/test.csv")


In [None]:
# Remove newline characters from the text column of each split (regex pattern,
# replaced by the empty string).
for split_frame in (train, valid, test):
    split_frame['text'] = split_frame['text'].replace('\\n', '', regex=True)

In [None]:
# Discard rows that have no text in any of the three splits.
train = train.dropna(subset=['text'])
valid = valid.dropna(subset=['text'])
test = test.dropna(subset=['text'])

In [None]:
# Report (rows, columns) for train, validation and test, in that order.
for split_frame in (train, valid, test):
    print(split_frame.shape)

(1364, 2)
(454, 2)
(455, 2)


In [None]:
# Persist the cleaned splits back to Drive without the index column.
# The validation split is written to `validate.csv`, the same file it is
# loaded from earlier in this notebook. Writing it to `valid.csv` would leave
# `validate.csv` holding the uncleaned data on the next run.
train.to_csv('/content/drive/My Drive/Loom Analytics NLP 2/dataset/train.csv', index = False)
valid.to_csv('/content/drive/My Drive/Loom Analytics NLP 2/dataset/validate.csv', index = False)
test.to_csv('/content/drive/My Drive/Loom Analytics NLP 2/dataset/test.csv', index = False)

# Generating Train and Test for BERT

In [None]:
# Work on independent copies so that encoding the labels for BERT leaves the
# original train/valid/test frames untouched.
train_bert, valid_bert, test_bert = [
    split_frame.copy() for split_frame in (train, valid, test)
]

In [None]:
# Class balance of the training split. These are the label names that get
# encoded as integers for BERT below.
train['label'].value_counts()

counsel     282
facts       243
outcome     241
judge       230
dec_name    198
court       170
Name: label, dtype: int64

In [None]:
# Integer class id assigned to each label name in the BERT datasets.
LABEL_TO_ID = {
    'dec_name': 0,
    'counsel': 1,
    'court': 2,
    'facts': 3,
    'judge': 4,
    'outcome': 5,
}

# One shared mapping is applied to every split so the three encodings cannot
# drift apart. Labels that are not keys of LABEL_TO_ID are left unchanged by
# `replace`.
for split_bert in (train_bert, valid_bert, test_bert):
    split_bert['label'] = split_bert['label'].replace(LABEL_TO_ID)

In [None]:
# Report (rows, columns) for the BERT train, validation and test frames.
for bert_frame in (train_bert, valid_bert, test_bert):
    print(bert_frame.shape)

(1364, 2)
(454, 2)
(455, 2)


In [None]:
# Write the integer-labelled splits to Drive, one CSV per split, without the
# index column.
for bert_frame, csv_path in (
    (train_bert, '/content/drive/My Drive/Loom Analytics NLP 2/dataset/train_bert.csv'),
    (valid_bert, '/content/drive/My Drive/Loom Analytics NLP 2/dataset/validate_bert.csv'),
    (test_bert, '/content/drive/My Drive/Loom Analytics NLP 2/dataset/test_bert.csv'),
):
    bert_frame.to_csv(csv_path, index=False)