# Data Preprocessing

In [1]:
import pymongo
import json
import re
import string
from tqdm.notebook import tqdm 
import nltk
from nltk import sent_tokenize, word_tokenize

# Every ASCII punctuation character as its own list element
punctuations = [*string.punctuation]

In [2]:
# Handle on the local MongoDB server and on the project database
client = pymongo.MongoClient("127.0.0.1:27017")
db = client.get_database('IRsegmentationDB3')

# Documents after annotation correction, annotation filtering and
# non-ASCII character removal
dataset = db.get_collection('dataset')

# Documents of `dataset` segmented into paragraphs according to their
# annotations. Used only for supervised learning.
pDataset = db.get_collection('pDataset')

# Documents split into tokens. The tokens are lower-cased and
# punctuation tokens are filtered out.
uDict = db.get_collection('unsDict')

In [3]:
# Load the raw annotated cases. The context manager closes the file handle
# as soon as parsing is done, and the explicit encoding keeps the result
# independent of the platform's default locale.
with open('./Dataset/trade_secret_cases.json', encoding='utf-8') as f:
    data = json.load(f)

# Materialise the top-level items of the JSON payload as a list
docs = list(data)

In [4]:
# Paragraph types that are removed from both the text and the annotations
_EXCLUDED_TYPES = ("Appendix", "Dissent/Concurrence")

# A run of non-ASCII characters; every run is replaced by one space
_NON_ASCII = re.compile(r'[^\x00-\x7F]+')

for i, x in enumerate(docs):
    # Sorting annotations by 'start' index
    annots = sorted(x['annotations'], key=lambda d: d['start'])

    # Annotations correction: an annotation that touches or overlaps the
    # previous one is moved so that it starts right after the previous 'end'.
    # Only 'start' values are rewritten, so pairing each annotation with its
    # successor gives the same result as walking the list by index.
    for prev, nxt in zip(annots, annots[1:]):
        if prev['end'] == nxt['start']:
            nxt['start'] += 1
        elif prev['end'] > nxt['start']:
            nxt['start'] = prev['end'] + 1

    # Slice the *raw* text with the annotation offsets and only then remove
    # non-ASCII characters from each slice. Cleaning the whole text first
    # collapses runs of non-ASCII characters and shifts every later offset.
    # NOTE(review): assumes the offsets index into the raw text - confirm.
    raw = x['text']

    # Remove Appendix and Dissent / Concurrence paragraphs (and empty
    # segments) from documents and annotations, re-indexing what is kept.
    paragraphs, kept = [], []
    for a in annots:
        p = _NON_ASCII.sub(' ', raw[a['start']:a['end']])
        if a['type'] in _EXCLUDED_TYPES or not p:
            continue
        a['index'] = len(kept)
        paragraphs.append(p)
        kept.append(a)

    # Text that precedes the first annotation is kept in front of the first
    # paragraph (no separator). A document without annotations has no
    # segment at all, so its whole text is treated as that leading part
    # instead of raising IndexError.
    lead = raw[:annots[0]['start']] if annots else raw

    x['text'] = _NON_ASCII.sub(' ', lead) + "¶".join(paragraphs)
    x['doc'] = i
    x['annotations'] = kept

    dataset.insert_one(x)

### Supervised Data Preprocessing

In [5]:
def _merge_short_sentences(sentences, max_tokens=5):
    """Merge sentences of at most `max_tokens` tokens into a neighbour.

    A short sentence is appended to the sentence before it. A short sentence
    at the very start of the paragraph has no predecessor, so it is prepended
    to the sentence that follows (and the result is checked again). Sentence
    order is always preserved. If the whole paragraph stays short, it is
    returned as a single sentence.
    """
    merged = []
    pending = None  # short leading text still waiting for a following sentence

    for snt in sentences:
        if pending is not None:
            snt = pending + " " + snt
            pending = None

        if len(nltk.word_tokenize(snt)) <= max_tokens:
            if merged:
                merged[-1] = merged[-1] + " " + snt
            else:
                pending = snt
        else:
            merged.append(snt)

    if pending is not None:
        merged.append(pending)

    return merged


for x in dataset.find():
    sent = []

    # Paragraphs were joined with the pilcrow separator when `dataset` was built
    for p in x['text'].split("¶"):
        s = [snt.lower() for snt in sent_tokenize(p, language="english")]

        # Paragraphs in which no sentence is found are skipped
        if not s:
            continue

        # Merging too small sentences
        sent.append(_merge_short_sentences(s))

    # The document text becomes a list of paragraphs, each a list of sentences
    x['text'] = sent

    pDataset.insert_one(x)

### Unsupervised Data Preprocessing

In [6]:
# Set lookup for the punctuation filter (built once, outside the loop)
punct = set(punctuations)

for i, d in enumerate(dataset.find()):
    # Turn the paragraph separator into whitespace *before* tokenizing, so
    # that the last word of a paragraph and the first word of the next one
    # are not tokenized as a single glued token.
    text = d['text'].replace("¶", " ")

    # Lower-case and trim every token, then drop punctuation and empty tokens
    tokens = (w.lower().strip() for w in nltk.word_tokenize(text))
    tokens = [w for w in tokens if w and w not in punct]

    uDict.insert_one({"doc": i, "text": tokens, "title": d['name']})