In [1]:
# Colab-only setup. The guard is false when this notebook is imported as a
# module (its __name__ is then not '__main__'), so the mount is skipped.
if __name__ == '__main__':
  # Mount Google Drive under /content/gdrive/ (force_remount=True is passed to the mount call).
  from google.colab import drive
  drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [2]:
if __name__ == '__main__':
  ## Load all ipynb files from Google Drive to Colab environment
  import os
  import shutil

  def find_and_copy_files(src_folder, dest_folder, file_extension):
    """Copy every file under `src_folder` whose name ends with `file_extension`.

    The directory tree is walked recursively and each match is copied
    directly into `dest_folder` (sub-folder structure is not reproduced, so
    two files with the same name end up at the same destination path).
    Each copy is reported on stdout; a failing copy is reported and skipped.
    """
    for current_dir, _subdirs, names in os.walk(src_folder):
      for name in names:
        if not name.endswith(file_extension):
          continue

        source = os.path.join(current_dir, name)
        target = os.path.join(dest_folder, name)
        try:
          # copy2 also carries over file metadata (shutil.copy would not).
          shutil.copy2(source, target)
          print(f"Copied: {source} to {target}")
        except Exception as e:
          print(f"Error copying {source}: {e}")

In [3]:
# Copy the project notebooks from Drive into the Colab working directory.
# Only runs when the notebook is executed directly (not when imported).
if __name__ == '__main__':
  src_folder = '/content/gdrive/MyDrive/OWR/source'   # root that is searched recursively
  dest_folder = '/content'                             # every match is copied directly into this folder
  file_extension = '.ipynb'                            # file-name suffix to match

  # Call the function to find and copy files
  find_and_copy_files(src_folder, dest_folder, file_extension)

Copied: /content/gdrive/MyDrive/OWR/source/input_generator/owr_input_generator.ipynb to /content/owr_input_generator.ipynb
Copied: /content/gdrive/MyDrive/OWR/source/owr_model/owr_model_v2.ipynb to /content/owr_model_v2.ipynb
Copied: /content/gdrive/MyDrive/OWR/source/owr_model/owr_segmentation_v3.ipynb to /content/owr_segmentation_v3.ipynb
Copied: /content/gdrive/MyDrive/OWR/source/owr_model/preprocessing/owr_preprocess_skew_CNN_classification.ipynb to /content/owr_preprocess_skew_CNN_classification.ipynb
Copied: /content/gdrive/MyDrive/OWR/source/owr_model/preprocessing/owr_preprocess_skew_CNN_regression.ipynb to /content/owr_preprocess_skew_CNN_regression.ipynb
Copied: /content/gdrive/MyDrive/OWR/source/owr_model/preprocessing/owr_preprocess_skew_cv.ipynb to /content/owr_preprocess_skew_cv.ipynb
Copied: /content/gdrive/MyDrive/OWR/source/owr_model/preprocessing/owr_preprocessing.ipynb to /content/owr_preprocessing.ipynb
Copied: /content/gdrive/MyDrive/OWR/source/owr_model/preprocess

In [4]:
# Install and load import_ipynb, then clear the installer output.
# Only runs when the notebook is executed directly (not when imported).
if __name__ == '__main__':
  from IPython.display import clear_output as cls
  # IPython shell escape: installs the package at run time.
  # NOTE(review): no version is pinned here — consider pinning for reproducibility.
  !pip install import_ipynb
  import import_ipynb

  # Clear this cell's output (hides the pip log).
  cls()

In [5]:
# Project modules, bound to short aliases used by the cells below.
# NOTE(review): these imports happen only when the notebook is run directly,
# yet later cells reference POM / DMM / SWM / PSTM at top level outside any
# guard — confirm how those names are provided when this notebook is imported.
if __name__ == '__main__':
  # Import Colab Models
  import owr_postprocess_module as POM
  import owr_data_model_v1 as DMM
  import owr_segmentation_v3 as SWM
  import owr_preprocess_skew_tilt as PSTM

importing Jupyter notebook from owr_pre_skew_cv_base.ipynb


In [6]:
import os
import json
import random
import matplotlib.pyplot as plt
import PIL.Image as Image
import numpy as np
import math
from datetime import datetime
import pandas as pd
import cv2
from google.colab.patches import cv2_imshow
from IPython.display import clear_output as cls

from tensorflow import keras
import tensorflow as tf
from tensorflow.keras.models import load_model

In [7]:
# Global Parameters

# Image folders (original scans / modified scans) and the folder for text logs.
org_img_path = '/content/gdrive/MyDrive/OWR/data/tmp_input'
modified_img_path = '/content/gdrive/MyDrive/OWR/data/input'
output_path = '/content/gdrive/MyDrive/OWR/log/printLogs/'

# Saved model files.
word_model_path = '/content/gdrive/MyDrive/OWR/dl_models/OWR_Model/Best_Img_recog_LSTM_Adam_model_run_weights.h5'
skew_model_path = '/content/gdrive/MyDrive/OWR/dl_models/skew_cnn_clas.h5'
tilt_model_path = '/content/gdrive/MyDrive/OWR/dl_models/tilted_T_cnn_clas.h5'

# Inclusive page range to process, minus the pages listed as exceptions.
start_page = 40
end_page   = 129
exception_pages = [92, 119]

# Despite its name, no_of_pages holds the page numbers themselves.
no_of_pages = np.arange(start_page, end_page + 1, dtype=int)
# True for every page that is NOT in exception_pages.
mask = np.isin(no_of_pages, exception_pages, invert=True)
no_of_pages = no_of_pages[mask]


In [8]:
# Variable declarations: module-level state describing the course found on the
# page currently being processed, plus placeholders for the word model.

# Text fields
courseCode = courseTitle = courseAim = ''

# Numeric fields
level_credit_pos = 0
courseLevel = courseCredit = 0
courseTutorDirected = courseSelfDirected = 0

# List fields (each name gets its own, separate list object)
coursePreRequisite, courseLearningOutcome = [], []
courseAssessment, courseCompletion = [], []

# Word-recognition model handles, filled in once the model is loaded
word_model = word_model_classes = word_model_num_classes = None

In [9]:
def paramInitialise():
  """Reset the module-level course fields to their empty defaults.

  Called before each page is processed so that no value extracted from a
  previous page can leak into the next one. The names are declared `global`;
  without that declaration the assignments would only create function locals
  and the call would have no effect.
  """
  global courseCode, level_credit_pos, courseTitle, courseLevel, courseCredit
  global courseTutorDirected, courseSelfDirected, courseAim
  global coursePreRequisite, courseLearningOutcome, courseAssessment, courseCompletion

  courseCode = ''
  level_credit_pos = 0
  courseTitle = ''
  courseLevel = 0
  courseCredit = 0
  courseTutorDirected = 0
  courseSelfDirected = 0
  courseAim = ''
  # Fresh list objects on every call, so earlier results are never shared.
  coursePreRequisite = []
  courseLearningOutcome = []
  courseAssessment = []
  courseCompletion = []

In [10]:
def printParam(path, file_name, page_no):
  """Append the current course fields to a text log.

  Args:
    path: Folder prefix of the log file. It is concatenated with `file_name`
      as-is, so it must end with a path separator.
    file_name: Name of the log file; opened in append mode.
    page_no: Page number written on the record's first line.

  The course values are read from the module-level course* variables. Each
  call writes one record terminated by a line of 100 dashes.
  """
  with open(path + file_name, 'a') as f:
    # Use the page_no argument, not whatever global `page` happens to exist.
    print('PageNo', page_no, file=f)
    print("Course Code:", courseCode, file=f)
    print("Course Title:", courseTitle, file=f)
    print("Course Level:", courseLevel, file=f)
    print("Course Credit:", courseCredit, file=f)
    print("Course Tutor Directed Learning Hours:", courseTutorDirected, file=f)
    print("Course Self-Directed Learning Hours:", courseSelfDirected, file=f)
    print("Course Aim:", courseAim, file=f)
    print("Course Pre-Requisite:", coursePreRequisite, file=f)
    print("Course Learning Outcome:", courseLearningOutcome, file=f)
    print("Course Assessment:", courseAssessment, file=f)
    print("Course Completion:", courseCompletion, file=f)
    print('-'*100, file=f)

In [11]:
def get_pageImage(path, page):
  """Load the image for `page` from `path`.

  The file read is `<path>/page_<page + 1>.png` (note the +1 offset between
  the page argument and the file name).

  Returns:
    The image returned by cv2.imread, or None when the file could not be
    read. A message is printed in the None case.
  """
  img_path = f'{path}/page_{page + 1}.png'
  try:
    img = cv2.imread(img_path)
  except Exception as err:
    # Keep the original best-effort contract: report and return None.
    print(f"Image not found: {img_path} ({err})")
    return None

  # cv2.imread signals a missing/unreadable file by returning None rather than
  # raising, so the check has to be explicit for the message to appear.
  if img is None:
    print(f"Image not found: {img_path}")

  return img

In [12]:
# Name of the CSV file expected inside an image folder.
csv_file_name = 'input.csv'

def load_csv(location):
  """Read `<location>/<csv_file_name>` and return it as a pandas DataFrame."""
  return pd.read_csv(os.path.join(location, csv_file_name))


def page_type(df, page):
  """Return the TYPE recorded for a page, or 'T' when the page is not listed.

  Args:
    df: DataFrame with a string column FILE_NAME and a column TYPE.
    page: Page index; the file looked up is `page_<page + 1>`.

  The match is case-insensitive and requires that the page number is not
  followed by another digit, so `page_11` no longer matches `page_110`.
  Rows whose FILE_NAME is missing are ignored. If several rows match, the
  first one is used.
  """
  # (?!\d) rejects longer page numbers that merely start with the same digits.
  pattern = rf'page_{page + 1}(?!\d)'
  is_match = df['FILE_NAME'].str.contains(pattern, case=False, regex=True, na=False)
  matching_rows = df[is_match]

  if matching_rows.empty:
    return 'T'
  return matching_rows['TYPE'].iloc[0]


In [13]:
def convertToNumber(number):
  """Replace upper-case letters that resemble digits with those digits.

  Mapping: S->5, A->4, L->1, O->0, I->1, Z->2, E->3. All other characters
  (including lower-case letters) are returned unchanged.
  """
  lookalikes = str.maketrans('SALOIZE', '5410123')
  return number.translate(lookalikes)

In [14]:
def transform_assessment(assessment):
  """Normalise the raw assessment rows into plain strings.

  Args:
    assessment: Sequence of rows; each row is a sequence of cells and each
      cell is something ''.join() accepts (a string or a list of strings).
      The first cell of every row is skipped.
      # assumes the first cell is a row header — TODO confirm against the producer

  Returns:
    None when `assessment` is None; otherwise a list with one entry per row:
      row 0 -> list of joined cell texts,
      row 1 -> list holding the first space-separated token of each cell,
               passed through convertToNumber,
      row 2 -> list of lists: every space-separated token of each cell,
               passed through convertToNumber,
      any further row -> an empty list.

  The function works on its argument only; it does not read the
  module-level courseAssessment.
  """
  if assessment is None:
    # Nothing was extracted; let the caller's own None check handle it.
    return None

  output = []
  for row_idx, row in enumerate(assessment):
    cells = row[1:]
    if row_idx == 0:
      content = [''.join(cell) for cell in cells]
    elif row_idx == 1:
      content = [convertToNumber(''.join(cell).split(' ')[0]) for cell in cells]
    elif row_idx == 2:
      content = [
          [convertToNumber(token) for token in ''.join(cell).split(' ')]
          for cell in cells
      ]
    else:
      content = []
    output.append(content)

  return output

In [21]:
# Load word recognition model
word_model, word_model_classes, word_model_num_classes = SWM.load_word_prediction_model(word_model_path)

# Open the database connection.
# The password is no longer kept in the notebook source: it is read from the
# OWR_DB_PASSWORD environment variable, or prompted for (without echo) when
# that variable is unset or empty. OWR_DB_USER overrides the default user.
from getpass import getpass
db_user = os.environ.get('OWR_DB_USER', 'whitireia_admin')
db_password = os.environ.get('OWR_DB_PASSWORD') or getpass(f'Database password for {db_user}: ')
dbConnection = DMM.get_db_connection(db_user, db_password)

# Load the skew and tilt models from their saved files.
skew_model = load_model(skew_model_path)

tilt_model = load_model(tilt_model_path)

In [22]:
# Ask which image folder to process.
path_ref = int(input("Enter 1 for skewed and tilt images (\'input\') folder, 2 for Original images (\'tmp_input\') folder: "))

# Only answer 1 selects the modified images; every other answer keeps the originals.
img_path = modified_img_path if path_ref == 1 else org_img_path

# Preprocessing is offered only for the modified images; 3 means "none".
preprocess_ref = 3
if path_ref == 1:
  preprocess_ref = int(input("Enter 1 for OpenCV preprocessing, 2 for CNN based preprocessing or 3 for NO preprocessing: "))

# CSV stored next to the images of the selected folder.
cnn_df = load_csv(img_path)

# Create Base Version
# NOTE(review): the version name and description below are fixed text and do
# not depend on the answers given above — confirm they describe the actual run.
dbVerID = DMM.insert_version(
    dbConnection,
    'CNN_OWR_INPUT_CD_V3',
    'PAGES WITH TILT OR SKEWING WITH OPENCV PREPROCESSING BUT WITHOUT TILT CORRECTION',
)

# Main pipeline: for every selected page, load the image, optionally correct
# skew/tilt, segment it, extract the course fields, append them to the text
# log and store them in the database under dbVerID.
for page in no_of_pages:
#for page in range(42,43):
  # Initialise Parameters
  paramInitialise()

  image = get_pageImage(img_path, page)

  # Skip pages whose image could not be loaded.
  if image is None:
    continue

  # Optional skew/tilt correction; any other preprocess_ref value leaves the image untouched.
  match preprocess_ref:
    case 1:
      image = PSTM.set_OpenCV_skew_tilt_corrected(image)
    case 2:
      skew_tilt_type = page_type(cnn_df, page)
      image = PSTM.set_CNN_skew_tilt_corrected(image, skew_model, tilt_model, skew_tilt_type)

  binaryImg = SWM.getBinarized(image, False)

  #1cv2_imshow(binaryImg)

  # Initial level segmentation on Tables
  outer_rectangles, child_contours = SWM.getContours(binaryImg)

  # Get Sections splitted
  # NOTE(review): extract=1 / 2 / -1 presumably select the code+title table,
  # the assessment table and the remainder of the page — confirm in SWM.getTableImg.
  imgCode_Title = SWM.getTableImg(binaryImg, outer_rectangles, extract=1)
  imgAssessment = SWM.getTableImg(binaryImg, outer_rectangles, extract=2)
  imgRest = SWM.getTableImg(binaryImg, outer_rectangles, extract=-1)


  # Get Course Code
  courseCode = SWM.get_courseCode(imgCode_Title, outer_rectangles, child_contours, \
                                  word_model, SWM.findWord, word_model_classes)
  # Get Course Title
  courseTitle = SWM.get_courseTitle(imgCode_Title, outer_rectangles, child_contours, \
                                    word_model, SWM.findWord, word_model_classes)

  # Get Course Assessment
  courseAssessment = SWM.get_courseAssessment(imgAssessment, outer_rectangles, child_contours, \
                                              word_model, SWM.findWord, word_model_classes)
  # NOTE(review): this condition is true for every value (`x is not None or x != ''`
  # cannot be false), so the transform always runs. Confirm whether `and`, or a
  # test on courseAssessment instead of courseCode, was intended.
  if courseCode is not None or courseCode != '' :
    courseAssessment = transform_assessment(courseAssessment)

  # Get Remaining Doc
  docWord = SWM.get_Doc(imgRest, word_model, SWM.findWord, word_model_classes)

  # Fall back to text-based extraction for fields the table extraction left empty.
  if courseCode is None or courseCode == '' :
    courseCode, level_credit_pos = POM.get_courseCode(' '.join(docWord))
  if courseTitle is None or courseTitle == '':
    courseTitle = POM.get_courseTitle(' '.join(docWord), courseCode, level_credit_pos)
  # NOTE(review): courseAssessment[0] raises IndexError when the list is empty.
  if courseAssessment is None or len(courseAssessment[0]) == 0:
    courseAssessment = POM.get_courseAssessments('\n'.join(docWord))

  # Get Course Level & Credit
  courseLevel, courseCredit = POM.get_owr_courseLevelAndCredid(docWord)

  # Get Course Tutor Directed Learning Hours & Self Directed Learning Hours
  courseTutorDirected, courseSelfDirected = POM.get_owr_courseTutorAndSelfHrs(docWord)

  # Get Course Aim
  courseAim = POM.get_owr_aim(docWord)

  # Get Course Pre-Requisite
  coursePreRequisite = POM.get_owr_prerequisite(docWord)

  # Get Course Learning Outcome
  courseLearningOutcome = POM.get_owr_courseLearningOutcome(docWord)

  # Get Course Completion
  courseCompletion = POM.get_owr_courseCompletion(docWord)

  # Append the extracted fields for this page to the text log.
  printParam(output_path,'INPUT_OWR_CNN_CD_V3.txt',page)

  # DB Course Details
  # int(page) converts the numpy integer coming from no_of_pages to a plain int.
  dbCourseID = DMM.insert_course(dbConnection, courseCode, courseTitle, courseCredit, courseLevel, courseSelfDirected, courseTutorDirected, courseAim, dbVerID, int(page))

  # DB Course PreRequisite
  # Each prerequisite is indexed as a pair: preReq[0], preReq[1].
  for preReq in coursePreRequisite:
    DMM.insert_prerequisite(dbConnection, preReq[0], preReq[1], dbCourseID)

  # DB Learning Outcomes
  # Each learning outcome is indexed as a pair: learningOutcome[0], learningOutcome[1].
  for learningOutcome in courseLearningOutcome:
    DMM.insert_outcome(dbConnection, learningOutcome[0], learningOutcome[1], dbCourseID)

  # DB Completion
  for comopletion in courseCompletion:
    DMM.insert_completion(dbConnection, comopletion, dbCourseID)

  # DB Assessment
  # Rows 0, 1 and 2 of courseAssessment are read in parallel by position.
  # NOTE(review): assumes at least three rows, with rows 1 and 2 at least as
  # long as row 0 — otherwise this raises IndexError.
  for i in range(len(courseAssessment[0])):
    method = courseAssessment[0][i]
    weight = courseAssessment[1][i]
    learning = courseAssessment[2][i]

    dbAssessmentID = DMM.insert_assessment(dbConnection, method, weight, dbCourseID)
    #print(method, weight, learning)
    #dbAssessmentID = insert_assessment(dbConnection, method, weight, dbCourseID)

    # One link row per entry of `learning` for this assessment.
    for learn in learning:
      DMM.insert_assessment_outcome(dbConnection, dbAssessmentID, learn)




Enter 1 for skewed and tilt images ('input') folder, 2 for Original images ('tmp_input') folder: 1
Enter 1 for OpenCV preprocessing, 2 for CNN based preprocessing or 3 for NO preprocessing: 2


In [23]:

# Final update for index
# Second pass over the database: fill in references that were still empty
# after the page loop, restricted to records of the version created in this
# run (dbVerID).

# --- Prerequisites whose course reference is still empty ---
null_prerequisite = DMM.get_null_prerequisites(dbConnection)
# record keys used below: prerequisiteID, code, title, courseID

for preReq in null_prerequisite:
  tmpVersionID = DMM.get_versionID_by_courseID(dbConnection, preReq['courseID'])
  if tmpVersionID != dbVerID:
    continue  # belongs to another version

  tmpCourseID = DMM.get_courseID_by_criteria(dbConnection, preReq['code'], preReq['title'], dbVerID)
  if tmpCourseID is None:
    continue  # no matching course in this version

  DMM.update_prerequisite_courseID(dbConnection, preReq['prerequisiteID'],  tmpCourseID)


# Update Learning Outcomes
# --- Assessment/outcome links whose outcome reference is still empty ---
null_learning_outcome = DMM.find_assessments_with_null_outcome(dbConnection)
# record keys used below: assessmentID, outcomeID_value, courseID

for outcomes in null_learning_outcome:
  tmpVersionID = DMM.get_versionID_by_courseID(dbConnection, outcomes['courseID'])
  if tmpVersionID != dbVerID:
    continue  # belongs to another version

  tmpOutcomeID = DMM.get_outcomeID_by_index_and_course(dbConnection, outcomes['outcomeID_value'], outcomes['courseID'])
  if tmpOutcomeID is None:
    continue  # no outcome with that index for this course

  DMM.update_outcomeID(dbConnection, outcomes['assessmentID'], outcomes['outcomeID_value'], tmpOutcomeID)

No matching record found.
No matching record found.
No matching record found.
No matching record found.
No matching record found.
No matching record found.
No matching record found.
No matching record found.
No matching record found.
No matching record found.
No matching record found.
No matching record found.
No matching record found.
No matching record found.
No matching record found.
No matching record found.
No matching record found.
No matching record found.
No matching record found.
No matching record found.
No matching record found.
No matching record found.
No matching record found.
No matching record found.
No matching record found.
No matching record found.
No matching record found.


In [24]:
# Close the database connection once all inserts and updates are done.
DMM.set_connection_close(dbConnection)

In [19]:
#docWord

In [20]:
#from nltk import word_tokenize

