### Notebook Summary :
- This notebookcan can read a folder of word documents and determine which ones are letters and which are contracts.

### Import Libraries

In [1]:
### pre requisites - 
#! pip install docx2txt
#! pip install gensim
#! pip install nltk

import pandas as pd
import os
import nltk
import docx2txt
import re
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
import gensim
from gensim.models.doc2vec import TaggedDocument

from nltk.corpus import stopwords
stopwords_en = stopwords.words("english")

### List All docx files in folder

In [2]:
folder_name = "test_docs"
file_list = []
df = pd.DataFrame() 
path = os.getcwd()
folder_path = path+"\\"+folder_name+"\\"
for root, dirs, files in os.walk(folder_path):
    for file in files:
        if file.endswith('.docx'):
            file_list.append(file)

In [3]:
file_list

['sample (1).docx',
 'sample (2).docx',
 'sample (3).docx',
 'sample (4).docx',
 'sample (5).docx',
 'sample (6).docx',
 'sample (7).docx',
 'sample (8).docx']

### Text Cleaning 

In [4]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
def processing(raw):
    p = r"[0-9]"
    raw = re.sub(p,'',raw)
    punc = '''!()-[]{};:'"“”\,<>./?@#$%^&*_~'''
    my_stop_words = ['say', '\s', 'mr', 'Mr', 'said', 'says', 'saying', 'today', 'be','shall','would','mrs']

    for ele in raw:
        if ele in punc:
            raw = raw.replace(ele, "")
    wordlist = nltk.word_tokenize(raw)
    wordlist = [w.lower() for w in wordlist]
    text = [w for w in wordlist if w not in stopwords_en and w not in my_stop_words]
    tokens = [get_lemma(w) for w in wordlist]
    return text

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\0035RO744\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
f0 = docx2txt.process(folder_path +file_list[0])
text0 = processing(f0)
f1 = docx2txt.process(folder_path +file_list[1])
text1 = processing(f1)
f2 = docx2txt.process(folder_path +file_list[2])
text2 = processing(f2)
f3 = docx2txt.process(folder_path +file_list[3])
text3 = processing(f3)
f4 = docx2txt.process(folder_path +file_list[4])
text4 = processing(f4)
f5 = docx2txt.process(folder_path +file_list[5])
text5 = processing(f5)
f6 = docx2txt.process(folder_path +file_list[6])
text6 = processing(f6)
f7 = docx2txt.process(folder_path +file_list[7])
text7 = processing(f7)

### Train Model

In [6]:
taggeddocs = []
doc1 = TaggedDocument(words = text0, tags=[u'Letter'])
taggeddocs.append(doc1)
doc2 = TaggedDocument(words = text3, tags=[u'Contract'])
taggeddocs.append(doc2)

In [7]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(taggeddocs)])

for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(taggeddocs)]), total_examples=len(taggeddocs), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<00:00, 1996.34it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<00:00, 1993.49it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<?,

In [8]:
#use this saved model to get the tag for any unseen document.
model_dbow.save('dbow.doc2vec')

### Output

In [9]:
# Assigning tag to each file 
# the structure would be filename : type of file identified from its content(letters/Contract)
tag_dict = {}
parsed_text_list = [text0,text1,text2,text3,text4,text5,text6,text7]
for i in range(0,len(file_list)):
    #print(file_list[i])
    new_doc_vec = model_dbow.infer_vector(parsed_text_list[i], alpha=0.10)
    x = model_dbow.dv.most_similar(positive=[new_doc_vec])
    tag, value = max(x, key=lambda item: item[1])
    tag_dict.update({file_list[i]:tag})


In [10]:
tag_dict

{'sample (1).docx': 'Letter',
 'sample (2).docx': 'Letter',
 'sample (3).docx': 'Letter',
 'sample (4).docx': 'Contract',
 'sample (5).docx': 'Contract',
 'sample (6).docx': 'Contract',
 'sample (7).docx': 'Contract',
 'sample (8).docx': 'Contract'}

In [11]:
#End of notebook