In [1]:
# Convolutional Neural Network

# Installing Theano
# pip install --upgrade --no-deps git+git://github.com/Theano/Theano.git

# Installing Tensorflow
# pip install tensorflow

# Installing Keras
# pip install --upgrade keras

# Part 1 - Building the CNN

# Importing the Keras libraries and packages
from keras.models import Sequential
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense

Using TensorFlow backend.


In [2]:


# Initialising the CNN
classifier = Sequential()

# Convolution / pooling stack, added in order:
#   * Convolution - feature detectors are applied to the input image and
#     produce feature maps. The input is a 3D array because the images are
#     coloured (64 x 64 pixels, 3 channels).
#   * Max pooling - keeps the maximum of every 2x2 window of a feature map,
#     giving a smaller pooled feature map (less complexity) without losing
#     the spatial structure.
#   * A second convolution + pooling pair. input_shape is only needed on the
#     very first layer, so it is not repeated here.
conv_pool_layers = (
    Conv2D(32, (3, 3), input_shape=(64, 64, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
)
for layer in conv_pool_layers:
    classifier.add(layer)







In [3]:

# Step 3 - Flattening
# The pooled feature maps are unrolled into one long vector that becomes the
# input of the fully connected network below.
# Why is the spatial structure not lost by flattening? The high values in the
# feature maps were produced by the feature detectors during convolution; max
# pooling keeps those high values and flattening keeps them as well.
# Why not flatten the raw pixels directly? Each entry would then describe a
# single pixel only, not how the pixels around it are arranged. After
# convolution and pooling every value corresponds to a feature of the image,
# so the spatial structure of the picture is preserved.
#
# Step 4 - Full Connection
# A hidden layer of 128 units followed by a single sigmoid output unit.
dense_head = (
    Flatten(),
    Dense(units=128, activation='relu'),
    Dense(units=1, activation='sigmoid'),
)
for layer in dense_head:
    classifier.add(layer)

# Compile - optimizer, loss function, performance metric
# Loss: logarithmic loss. Binary cross entropy for two outcomes; categorical
# cross entropy would be used for more than two outcomes.
# Metric: accuracy.
classifier.compile(loss='binary_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])

# Part 2 - Fitting the CNN to the images
# Keras image preprocessing (image augmentation) is used to reduce overfitting,
# i.e. great accuracy on the training set but poor results on the test set.
# A CNN needs lots of images to find correlations and patterns in the pixels.
# Even 10000 images (8000 of them for training) is not that much, so we either
# need more images or use a trick.
# Image augmentation creates batches and applies random transformations to the
# images of each batch, leading to more diverse images and more training material.
# Image augmentation therefore enriches our dataset to prevent overfitting.




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [4]:
from keras.preprocessing.image import ImageDataGenerator

# Part 2 - Fit the CNN on the image folders, then persist the trained model.

# Locations and hyper-parameters used by this cell.
TRAIN_DIR = '/home/dev2/Sanjana/Metadata Extraction/dataset/train'
TEST_DIR = '/home/dev2/Sanjana/Metadata Extraction/dataset/test'
MODEL_DIR = '/home/dev2/Sanjana/Metadata Extraction/models'
IMAGE_SIZE = (64, 64)  # must agree with input_shape of the first Conv2D layer
BATCH_SIZE = 32
EPOCHS = 50

# Training images are rescaled to [0, 1] and randomly sheared / zoomed /
# flipped (image augmentation); test images are only rescaled.
train_datagen = ImageDataGenerator(
        rescale=1./255,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)

test_datagen = ImageDataGenerator(rescale=1./255)

training_set = train_datagen.flow_from_directory(TRAIN_DIR,
                                                 target_size=IMAGE_SIZE,
                                                 batch_size=BATCH_SIZE,
                                                 class_mode='binary')

test_set = test_datagen.flow_from_directory(TEST_DIR,
                                            target_size=IMAGE_SIZE,
                                            batch_size=BATCH_SIZE,
                                            class_mode='binary')

# Keras 2 argument names. The former samples_per_epoch / nb_epoch /
# nb_val_samples are Keras 1 names; Keras 2 counts *batches* (steps), not
# samples. len(iterator) is the number of batches needed to see every image
# once, so each epoch is exactly one pass over the training set and the
# validation set is evaluated exactly once per epoch.
classifier.fit_generator(training_set,
                         steps_per_epoch=len(training_set),
                         epochs=EPOCHS,
                         validation_data=test_set,
                         validation_steps=len(test_set))

# Saving the model: architecture as JSON, weights as HDF5 ...
model_json = classifier.to_json()
with open(MODEL_DIR + "/model.json", "w") as json_file:
    json_file.write(model_json)

classifier.save_weights(MODEL_DIR + "/model.h5")
print("Saved model to disk")

# ... and the complete model (architecture + weights + optimizer state) in one
# file, which is what the prediction cell loads.
classifier.save(MODEL_DIR + '/CNN.model')


Found 262 images belonging to 2 classes.
Found 10 images belonging to 2 classes.





Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Saved model to disk


In [10]:

# Part 3 - Making new predictions

import numpy as np
from keras.preprocessing import image
import docx2txt
import glob, os, shutil
import cv2
import tensorflow as tf
# Extract the images embedded in every .docx file of the Word directory.
#
# For each document <name>.docx this
#   1. creates <Word>/<name>/ and <Word>/<name>/<name>/,
#   2. lets docx2txt write the embedded images into <Word>/<name>/,
#   3. renames every extracted PNG to <image>__<name>.png, and
#   4. copies the renamed PNG into the shared all_images directory.
# A document whose folders already exist (e.g. on a re-run) fails at step 1
# and is reported instead of being processed again.
docxnope = []   # image paths collected by later cells
legalok = []
doc_name = []   # every .docx file name that was visited

classifier = tf.keras.models.load_model("/home/dev2/Sanjana/Metadata Extraction/models/CNN.model")

path = r'/home/dev2/1tb/SANJANA/Word/'
all_images_dir = r'/home/dev2/1tb/SANJANA/all_images'
# Without an existing destination directory shutil.copy would create a plain
# file called "all_images" and overwrite it for every image.
os.makedirs(all_images_dir, exist_ok=True)

os.chdir(path)
for file in glob.glob("*.docx"):
    doc_name.append(file)
    # splitext strips only the trailing extension; str.replace would also
    # remove ".docx" occurring elsewhere in the name.
    dir_name = os.path.splitext(file)[0]
    try:
        img_dir = os.path.join(path, dir_name)
        nope_dir = os.path.join(img_dir, dir_name)
        os.makedirs(img_dir)
        os.makedirs(nope_dir)
        docx2txt.process(os.path.join(path, file), img_dir)
        # Iterate over *all* extracted PNGs. The previous
        # zip(files, range(1, len(files))) stopped one element early and
        # therefore skipped the last image of every document.
        for f1 in glob.glob(os.path.join(img_dir, '*.png')):
            if not os.path.isfile(f1):
                continue
            the_img = os.path.splitext(f1)[0] + '__' + dir_name + '.png'
            os.rename(f1, the_img)
            shutil.copy(the_img, all_images_dir)
    except Exception as exc:
        # Keep going with the next document, but say why this one failed.
        # (A bare except would also swallow KeyboardInterrupt.)
        print("error in doc name", file, "-", repr(exc))
        
# NOTE(review): everything from here down to the final print is disabled
# (commented-out) code: a per-image prediction step, a copy of the collected
# images into nope_dir, and an Excel export. It refers to names used by the
# extraction loop above (f1, data, docxnope, nope_dir, legalok, doc_name) and
# is not executed.
#         test_image = image.load_img(f1, target_size=(64, 64))
#         test_image = image.img_to_array(test_image)
#         test_image = np.expand_dims(test_image, axis = 0)
#         result = classifier.predict(test_image)
#         if result[0][0] == 1: 
#             #print("legalok")
#             data.append('legalok')
#             docxnope.append(f1)

#         else:
#             #print("nope")
#             data.append('nope')
#     except:
#         print("someerror")
#         pass
#     for i in docxnope:
#         if os.path.isfile(i):
#             shutil.copy(i, nope_dir)
#         else:
#             print ("file does not exist", i)
#             print(i)    
#     if "legalok" in data:
#         indices = [i for i, x in enumerate(data) if x == "legalok"]
#         #print(file,"yo present! at path",indices)
#         legalok.append("yes")
#     else:
#         #print(file,"not present")
#         legalok.append("no")

# import pandas as pd
# df=pd.read_excel(r'/home/dev2/Sanjana/legalok_1stcut.xlsx')
# df['Document_Name']= doc_name
# df['legal ok']=legalok
# Completion marker for this cell.
print("done")

done


In [None]:
import pandas as pd
df=pd.read_excel(r"/home/dev2/Sanjana/Stamp/trainingagain.xlsx")

In [11]:
print("fi")

fi


In [None]:
# Store the collected image paths as a new column and write the sheet back.
# NOTE: plain column assignment requires len(docxnope) == len(df); pandas
# raises ValueError when the lengths differ.
df['ERRORDOCS'] = docxnope

# index=False: this is the same workbook that is loaded with pd.read_excel,
# so writing the index would add another "Unnamed: 0" column on every
# read/write round trip.
df.to_excel(r"/home/dev2/Sanjana/Stamp/trainingagain.xlsx", index=False)

In [None]:
print("hi")

In [None]:
import glob, os, shutil
# Copy every collected image that still exists on disk into nope_dir.
# NOTE(review): nope_dir is not assigned in this cell; it holds whatever value
# the extraction loop assigned last - confirm that is the intended target.
for i in docxnope:
    # Log the path being handled (previously the entire list was printed on
    # every iteration).
    print(i)
    if os.path.isfile(i):
        print("yes")
        shutil.copy(i, nope_dir)


In [None]:
# Scratch check: print the type of str(i) for i = 1..4.
for i in (1, 2, 3, 4):
    print(type(str(i)))

In [None]:
()

In [1]:
import numpy as np
from keras.preprocessing import image
import docx2txt
import glob, os, shutil
import cv2
import tensorflow as tf
import os,glob
# Classify every PNG found directly in the Word directory and collect the
# images the CNN labels as class 1 ("legalok").
# NOTE(review): the extraction loop writes images into per-document
# sub-folders and the recorded output of this cell is "0" files - confirm that
# this glob pattern points at the folder that actually holds the images.
data_path = os.path.join(r'/home/dev2/1tb/SANJANA/Word','*.png')
files = glob.glob(data_path)
data = []       # one 'legalok' entry per positive image
docxnope = []   # paths of the positive images
print(len(files))

# This cell may be run on a fresh kernel where no model is in memory yet;
# load the saved model in that case instead of failing with a NameError.
if 'classifier' not in globals():
    classifier = tf.keras.models.load_model("/home/dev2/Sanjana/Metadata Extraction/models/CNN.model")

for f1 in files:
    test_image = image.load_img(f1, target_size=(64, 64))
    # Apply the same 1/255 rescaling the training generator used; feeding raw
    # 0-255 pixel values does not match what the network was trained on.
    test_image = image.img_to_array(test_image) / 255.0
    test_image = np.expand_dims(test_image, axis=0)   # add the batch dimension
    result = classifier.predict(test_image)
    print(result)
    # The sigmoid output is a probability, so use a 0.5 decision threshold
    # rather than testing a float for exact equality with 1.
    if result[0][0] >= 0.5:
        data.append('legalok')
        docxnope.append(f1)
        if os.path.isfile(f1):
            shutil.copy(f1, r'/home/dev2/1tb/SANJANA/all_images')

Using TensorFlow backend.


0


In [None]:
df.to_excel(r'/home/dev2/Sanjana/legalok_1stcut.xlsx')

In [2]:
print("done")

done
