# Part 1: Preparing data

Read document -> clean text -> convert to phrases -> labelling

### Import Library

In [1]:
import docx
from docx import Document
import re
import string
import numpy as np
import pandas as pd
import nltk

## 1. Read documents and clean text data

### Define a function that read docx file and save as text. Remove special punctuations and characters.

In [2]:
# Define a function 
def read_file(filename):
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        sentence = para.text
        sentence = sentence.lower()  # Converting to lowercase
        cleanr = re.compile('<.*?>')
        sentence = re.sub(cleanr, ' ', sentence)  #Removing HTML tags
        sentence = re.sub(r'[?|!|\'|"|#|,|)|(|\|/$%\n\t.:;""*]',r'',sentence) #Removing Punctuations
        fullText.append(sentence)
    while '' in fullText:
        fullText.remove('') # remove empty line
    return fullText

### Read all document inside the folder

In [3]:
import os

directory = "doc_folder/"

filesno=0
text = []
filelist = os.listdir(directory)

for docname in filelist:
    fullpath = directory+docname
    if fullpath.endswith("docx"):
        print(fullpath)
        filesno+=1
        text += read_file(fullpath)
        
print("Total number of files",filesno)


doc_folder/document_6.docx
doc_folder/document_7.docx
doc_folder/document_1.docx
doc_folder/document_2.docx
doc_folder/document_3.docx
doc_folder/document_8.docx
doc_folder/document_4.docx
doc_folder/document_5.docx
Total number of files 8


In [4]:
## for windows
# import io 
# with io.open("", "r", encoding="utf-8") as f:
#     for

## 2. Convert long text into short phrases 

In [5]:
'''
These function below are for supporting'text_to_phrases' function
'''
# identify all possible phrases
def key_words_phrases(raw):
    ngramlist=[]
    x=minlen
    ngramlimit = maxlen
    tokens=nltk.word_tokenize(raw)

    while x <= ngramlimit:
        ngramlist.extend(nltk.ngrams(tokens, x))
        x+=1
    return ngramlist

# join words into a new list
def concat_words(wordlist):
    new_list = []
    for words in wordlist:
        new_list.append(' '.join(words))   
    return new_list

'''
text into phrases
'''
# define maximum and minimum number of words in one phrase
maxlen = 10 
minlen = 6 

def text_to_phrases(text):
    phrases = []
    for sentence in text:
        if len(str(sentence).split(' ')) <= maxlen:
            phrases.append(sentence) 
        else:
            wordlist = key_words_phrases(sentence)
            phrases += concat_words(wordlist)
    
    print(len(phrases))
    return phrases

In [6]:
# list of phrases
phrases = text_to_phrases(text)

189059


## ------- Until here is read text in the documents and convert text into phrases only---------

## ------------- Bellow is ---------------
- import labeled text(long text) 
- clean text
- convert long sentences into short phrases
- labelling phrases

In [7]:
# load labeled text
traina = pd.read_csv("labeled_text.csv")

In [8]:
maxlen = 10
minlen = 6

phrases = []
labels = []

for i in range(train.shape[0]):
    sentence = train.loc[i,'text']
    sentence = sentence.lower()
    sentence = re.sub(r'[?|!|\'|"|#|,|)|(|\|/$%\n\t.:;""‘’]',r'',sentence)
    
    label = []
    
    if len(str(sentence).split(' ')) <= maxlen: # short sentence do not need to break down phrases
        phrases.append(sentence) 
        
        labels.append(train.loc[i,'label']) # append label to the phrase with text label
        
    else:
        wordlist = key_words_phrases(sentence)  # break down long sentence to several phrases
        phrases += concat_words(wordlist)
        
        label.append(train.loc[i,'label'])
        labels += label*len(wordlist)   # append same labels to all the phrases from the same text.

train_df = pd.DataFrame()
train_df['phrases'] = phrases  
train_df['labels'] = labels

print(train_df.shape)

(186760, 2)


In [9]:
# export
train_df.to_csv("train_data.csv", index=False)