# **Automating Course Descriptor**

In [1]:
# Skip this cell when the notebook is imported as a module; Google Drive is only
# mounted when the notebook itself is executed.
if __name__ == '__main__':
  # Mount Google Drive under /content/gdrive/. force_remount=True asks Colab to
  # mount again even if a mount is already present.
  from google.colab import drive
  drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [2]:
if __name__ == '__main__':
  ## Load all ipynb files from Google Drive to Colab environment
  import os
  import shutil

  def find_and_copy_files(src_folder, dest_folder, file_extension):
    """Copy every file under ``src_folder`` whose name ends with ``file_extension``.

    The source tree is walked recursively and each match is copied *flat* into
    ``dest_folder`` (the sub-folder structure is not reproduced). Two source
    files that share a name therefore overwrite each other; the one visited
    last wins.

    Args:
        src_folder: Root directory to search.
        dest_folder: Directory that receives the copies; created when missing.
        file_extension: Filename suffix to match, e.g. ``'.ipynb'``.

    Returns:
        None. Progress and per-file errors are reported with ``print``.
    """
    # Without the destination directory every single copy would fail, so make
    # sure it exists before walking the tree. An empty dest_folder means the
    # current directory and needs no creation.
    if dest_folder and not os.path.isdir(dest_folder):
        try:
            os.makedirs(dest_folder, exist_ok=True)
        except OSError as e:
            # Stay best-effort: report the problem and let the per-file error
            # handling below describe each failed copy.
            print(f"Error creating {dest_folder}: {e}")

    for foldername, _subfolders, filenames in os.walk(src_folder):
        for filename in filenames:
            if not filename.endswith(file_extension):
                continue
            src_file = os.path.join(foldername, filename)
            dest_file = os.path.join(dest_folder, filename)
            try:
                shutil.copy2(src_file, dest_file)  # copy2 also carries over file metadata
                print(f"Copied: {src_file} to {dest_file}")
            except Exception as e:
                # One unreadable file must not abort the rest of the copy run.
                print(f"Error copying {src_file}: {e}")

In [3]:
if __name__ == '__main__':
  # Source tree on Drive, flat destination inside the Colab runtime, and the
  # suffix that identifies notebook files.
  src_folder = '/content/gdrive/MyDrive/OWR/source'
  dest_folder = '/content'
  file_extension = '.ipynb'

  # Pull every matching notebook into /content so later cells can import them.
  find_and_copy_files(
      src_folder=src_folder,
      dest_folder=dest_folder,
      file_extension=file_extension,
  )

Copied: /content/gdrive/MyDrive/OWR/source/mysql/owr_data_model_v1.ipynb to /content/owr_data_model_v1.ipynb
Copied: /content/gdrive/MyDrive/OWR/source/input_generator/owr_input_generator.ipynb to /content/owr_input_generator.ipynb
Copied: /content/gdrive/MyDrive/OWR/source/owr_model/owr_model_v2.ipynb to /content/owr_model_v2.ipynb
Copied: /content/gdrive/MyDrive/OWR/source/owr_model/owr_segmentation_v3.ipynb to /content/owr_segmentation_v3.ipynb
Copied: /content/gdrive/MyDrive/OWR/source/owr_model/preprocessing/owr_preprocess_skew_CNN_classification.ipynb to /content/owr_preprocess_skew_CNN_classification.ipynb
Copied: /content/gdrive/MyDrive/OWR/source/owr_model/preprocessing/owr_preprocess_skew_CNN_regression.ipynb to /content/owr_preprocess_skew_CNN_regression.ipynb
Copied: /content/gdrive/MyDrive/OWR/source/owr_model/preprocessing/owr_preprocess_skew_cv.ipynb to /content/owr_preprocess_skew_cv.ipynb
Copied: /content/gdrive/MyDrive/OWR/source/owr_model/preprocessing/owr_preprocess

In [4]:
if __name__ == '__main__':
  # `cls` is the alias used by the setup cells to wipe noisy install/import output.
  from IPython.display import clear_output as cls
  # import_ipynb is installed and imported before the project notebooks
  # (owr_*.ipynb, copied into /content above) are imported as modules.
  !pip install import_ipynb
  import import_ipynb

  # Clear the pip log from the cell output.
  cls()

In [5]:
if __name__ == '__main__':
  # Import the project notebooks (copied into /content) as modules:
  #   POM - post-processing helpers that parse course fields from page text
  #   DMM - database helpers (versions, courses, prerequisites, outcomes, ...)
  import owr_postprocess_module as POM
  import owr_data_model_v1 as DMM

  # `cls` is only bound inside a __main__ guard, so the call has to live under
  # the same guard; at module level it raised NameError when this notebook was
  # imported rather than run.
  cls()

In [6]:
# Install PyPDF2
!pip install PyPDF2
cls()

In [7]:
# Import Libraries
import PyPDF2
import numpy as np
import nltk
nltk.download('punkt')
from nltk import sent_tokenize
from nltk import word_tokenize
import re
import mysql
import mysql.connector as msql
from mysql.connector import Error
from datetime import datetime

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
# Global Parameters
## Ground Truth Parameter details
# PDF the ground-truth course descriptors are extracted from.
ground_truth_file = '/content/gdrive/MyDrive/OWR/data/files/2023 Programme Handbook.pdf'
# Prefix for the text log written by printParam(). It is concatenated directly
# with the file name, so the trailing slash is required.
output_path = '/content/gdrive/MyDrive/OWR/log/printLogs/'
# Inclusive page range to process. The numbers are used as-is to index
# pdf_reader.pages, i.e. they are zero-based indices, not printed page numbers.
start_page = 40
end_page   = 130
# Indices inside the range above that are skipped.
exception_pages = [92, 119]

In [9]:
# Variable declarations
# Module-level holders for the fields extracted from the page currently being
# processed. The extraction loop overwrites them for every page and
# printParam() reads them when writing the log.
courseCode = ''
level_credit_pos = 0          # position value returned alongside the course code and passed on to the title/level/credit extractors
courseTitle = ''
courseLevel = 0
courseCredit = 0
courseTutorDirected = 0       # tutor-directed learning hours
courseSelfDirected = 0        # self-directed learning hours
courseAim = ''
coursePreRequisite = []
courseLearningOutcome = []
courseAssessment = []
courseCompletion = []


In [10]:
def paramInitialise():
  """Reset every module-level course field to its empty default.

  Called before a new page is parsed so that values from the previous page
  cannot leak into the next record. The lists are re-created rather than
  cleared, so references held elsewhere to the old lists are left untouched.

  Returns:
      None.
  """
  # Without these declarations the assignments below would only create
  # function-local names and the module-level fields would never be reset.
  global courseCode, level_credit_pos, courseTitle, courseLevel, courseCredit
  global courseTutorDirected, courseSelfDirected, courseAim
  global coursePreRequisite, courseLearningOutcome, courseAssessment, courseCompletion

  courseCode = ''
  level_credit_pos = 0
  courseTitle = ''
  courseLevel = 0
  courseCredit = 0
  courseTutorDirected = 0
  courseSelfDirected = 0
  courseAim = ''
  coursePreRequisite = []
  courseLearningOutcome = []
  courseAssessment = []
  courseCompletion = []

In [11]:
def printParam(path, file_name, page_no):
  """Append the currently extracted course fields to a plain-text log.

  One record is written per call: a page line, one line per course field and
  a 100-character dashed separator.

  Args:
      path: Prefix joined to ``file_name`` by plain concatenation (no
          separator is inserted).
      file_name: Name of the log file; opened in append mode.
      page_no: Page identifier written on the first line of the record.

  Returns:
      None. The values come from the module-level ``course*`` variables.
  """
  with open(path + file_name, 'a') as log:
    record = (
        ('PageNo', page_no),
        ("Course Code:", courseCode),
        ("Course Title:", courseTitle),
        ("Course Level:", courseLevel),
        ("Course Credit:", courseCredit),
        ("Course Tutor Directed Learning Hours:", courseTutorDirected),
        ("Course Self-Directed Learning Hours:", courseSelfDirected),
        ("Course Aim:", courseAim),
        ("Course Pre-Requisite:", coursePreRequisite),
        ("Course Learning Outcome:", courseLearningOutcome),
        ("Course Assessment:", courseAssessment),
        ("Course Completion:", courseCompletion),
    )
    for label, value in record:
      print(label, value, file=log)
    # Visual separator between consecutive page records.
    print('-'*100, file=log)

In [12]:
def get_pdfReader(pdf_file, start, end, exception):
  """Create a PDF reader and work out which page indices should be processed.

  Args:
      pdf_file: Path of the PDF to read.
      start: First page index to include.
      end: Last page index to include (inclusive).
      exception: Page indices inside ``start..end`` that must be skipped.

  Returns:
      A ``(reader, pages)`` tuple: the ``PyPDF2.PdfReader`` for ``pdf_file``
      and a NumPy integer array holding ``start..end`` without the entries
      listed in ``exception``.
  """
  # end + 1 because arange excludes its stop value and `end` is inclusive.
  candidate_pages = np.arange(start, end + 1, dtype=int)
  pages_to_read = candidate_pages[~np.isin(candidate_pages, exception)]

  # Hand PyPDF2 the path instead of an open file object: the reader then loads
  # the file and closes it itself. The previous open(...) handle was never
  # closed and leaked for the lifetime of the kernel.
  gt_pdf_reader = PyPDF2.PdfReader(pdf_file)

  return gt_pdf_reader, pages_to_read

In [13]:
def get_pageContent(pdf_reader, page_no):
  """Return the upper-cased text of one page.

  Args:
      pdf_reader: Reader object exposing an indexable ``pages`` collection.
      page_no: Index into ``pdf_reader.pages``; converted with ``int()`` so
          NumPy integers and numeric strings are accepted.

  Returns:
      The page's extracted text converted to upper case.
  """
  page_index = int(page_no)
  return pdf_reader.pages[page_index].extract_text().upper()

In [14]:
# Reader for the handbook plus the page indices that will be processed.
pdfReader, pages = get_pdfReader(ground_truth_file, start_page, end_page, exception_pages)

# The database password must not be stored in the notebook. Credentials are
# taken from the OWR_DB_USER / OWR_DB_PASSWORD environment variables; when the
# password is not set it is requested interactively (input is not echoed).
import os
from getpass import getpass

db_user = os.environ.get('OWR_DB_USER', 'whitireia_admin')
db_password = os.environ.get('OWR_DB_PASSWORD') or getpass(f'Password for {db_user}: ')

dbConnection = DMM.get_db_connection(db_user, db_password)

In [15]:
# Create Base Version
# Register a version record; its ID is passed to insert_course() for every
# course stored below and is used again by the back-fill step.
dbVerID = DMM.insert_version(dbConnection, 'GROUND_TRUTH_V1', 'BASE VERSION 1 FOR GROUND TRUTH FROM PDF EXTRACTION')

# One handbook page is treated as one course descriptor.
# Debugging tip: swap the loop header for `for page in range(39, 40):` to
# process a single page.
for page in pages:
  # Intended to reset the per-page fields; every field is reassigned below regardless.
  paramInitialise()
  pageContent = get_pageContent(pdfReader, page)
  # Periods are replaced by spaces before parsing.
  # NOTE(review): presumably so punctuation does not interfere with the POM
  # extractors - confirm against owr_postprocess_module.
  pageContent = pageContent.replace('.', ' ')

  # Parse the individual fields out of the page text. get_courseCode also
  # returns a position value that the title/level/credit extractors take as input.
  courseCode, level_credit_pos = POM.get_courseCode(pageContent)
  courseTitle = POM.get_courseTitle(pageContent, courseCode, level_credit_pos)
  courseLevel = POM.get_courseLevel(pageContent, level_credit_pos)
  courseCredit = POM.get_courseCredit(pageContent, level_credit_pos)
  courseTutorDirected = POM.get_courseTutorHrs(pageContent)
  courseSelfDirected = POM.get_courseSelfHrs(pageContent)
  courseAim = POM.get_courseAim(pageContent)
  coursePreRequisite = POM.get_coursePrerequisite(pageContent, courseCredit, courseTutorDirected)
  courseLearningOutcome = POM.get_courseLearningOutcome(pageContent)
  courseAssessment = POM.get_courseAssessments(pageContent)
  courseCompletion = POM.get_courseCompletion(pageContent)

  # Append the parsed record to the text log. The file name is spelled
  # 'goundTruth.txt' (sic); it is a runtime string, so renaming it would start
  # a new log file.
  printParam(output_path,'goundTruth.txt',page)


  # DB Course Details
  # The returned course ID is the parent key for all child rows inserted below.
  dbCourseID = DMM.insert_course(dbConnection, courseCode, courseTitle, courseCredit, courseLevel, courseSelfDirected, courseTutorDirected, courseAim, dbVerID, int(page))

  # DB Course PreRequisite
  # Each entry is indexed as a pair; presumably (code, title) - TODO confirm.
  for preReq in coursePreRequisite:
    DMM.insert_prerequisite(dbConnection, preReq[0], preReq[1], dbCourseID)

  # DB Learning Outcomes
  # Each entry is indexed as a pair; presumably (index, description) - TODO confirm.
  for learningOutcome in courseLearningOutcome:
    DMM.insert_outcome(dbConnection, learningOutcome[0], learningOutcome[1], dbCourseID)

  # DB Completion
  for comopletion in courseCompletion:
    DMM.insert_completion(dbConnection, comopletion, dbCourseID)

  # DB Assessment
  # courseAssessment is indexed as three parallel sequences: [0] methods,
  # [1] weights and [2] per-assessment collections of learning-outcome references.
  # NOTE(review): an empty courseAssessment raises IndexError on
  # courseAssessment[0]; confirm POM.get_courseAssessments always returns all three.
  for i in range(len(courseAssessment[0])):
    method = courseAssessment[0][i]
    weight = courseAssessment[1][i]
    learning = courseAssessment[2][i]

    dbAssessmentID = DMM.insert_assessment(dbConnection, method, weight, dbCourseID)

    # Link the assessment to every learning outcome it covers.
    for learn in learning:
      DMM.insert_assessment_outcome(dbConnection, dbAssessmentID, learn)




In [16]:

# Back-fill pass 1: prerequisites whose course reference is still missing.
# Each row is read through the keys 'prerequisiteID', 'code', 'title' and 'courseID'.
null_prerequisite = DMM.get_null_prerequisites(dbConnection)

for preReq in null_prerequisite:
  tmpVersionID = DMM.get_versionID_by_courseID(dbConnection, preReq['courseID'])
  # Only touch rows that belong to the version created in this run.
  if tmpVersionID != dbVerID:
    continue

  tmpCourseID = DMM.get_courseID_by_criteria(dbConnection, preReq['code'], preReq['title'], dbVerID)
  # No matching course in this version: leave the prerequisite unresolved.
  if tmpCourseID is None:
    continue

  DMM.update_prerequisite_courseID(dbConnection, preReq['prerequisiteID'],  tmpCourseID)


# Back-fill pass 2: assessment/outcome links whose outcome ID is still missing.
# Each row is read through the keys 'assessmentID', 'outcomeID_value' and 'courseID'.
null_learning_outcome = DMM.find_assessments_with_null_outcome(dbConnection)

for outcomes in null_learning_outcome:
  tmpVersionID = DMM.get_versionID_by_courseID(dbConnection, outcomes['courseID'])
  # Only touch rows that belong to the version created in this run.
  if tmpVersionID != dbVerID:
    continue

  tmpOutcomeID = DMM.get_outcomeID_by_index_and_course(dbConnection, outcomes['outcomeID_value'], outcomes['courseID'])
  # No outcome found for that index within the course: nothing to update.
  if tmpOutcomeID is None:
    continue

  DMM.update_outcomeID(dbConnection, outcomes['assessmentID'], outcomes['outcomeID_value'], tmpOutcomeID)



No matching record found.


In [17]:
# Release the database connection opened for this run.
DMM.set_connection_close(dbConnection)