## GradeReader

This program was created to automatically read and process grades submitted by students from their university or universities to their student apartment landlord. The landlords require the students to pass a minimum of 15 HP (högskolepoäng) each semester to be allowed to stay in their student apartment (this is to prevent non-students from living in the apartments).

Currently the landlords have to manually check the records that the students provide as PDF files. This program reads the PDF files, processes them to sum up the credits (HP) of the passed courses, and checks whether the student meets the requirement. This program is part of a larger administration software system for the landlord.

In [79]:
import pandas as pd
import numpy as np
import re
import datetime
import os


In [373]:
# List the entries of the current working directory and show them
arr = os.listdir()
print(arr)

['BoAdmin.ipynb', 'GradeReader.py.zip', 'GradeReader.py', 'R”da koset.pdf', 'studieintygtrafiklararutbildning.pdf', 'Untitled.ipynb', 'sodertorn2.pdf', 'sthlmuni2.pdf', 'resuktatintyg.pdf', 'Stockholms universitet.pdf', 'sthlmuni3.pdf', 'kth.pdf', 'sthlmuni4.pdf', 'sthlmuni5.pdf', 'sthlmuni6.pdf', '.ipynb_checkpoints', 'sodertorn.pdf', 'Intyg.pdf', 'Uppsala.pdf', 'Uppsalauni.pdf', 'GIH.pdf', 'Stockholms universitet2.pdf', 'sthlmuni.pdf', 'Stockholms universitet22.pdf']


In [465]:
# File to process; this name is handed to the Tika parser when the PDF is read
path = 'sodertorn2.pdf'

In [466]:
# Import Tika PDF parser
from tika import parser

# Read the PDF file and split the extracted text into lines.
# rawList is always defined afterwards (empty when the file could not be
# read), so the document-type check that follows reports an invalid file
# instead of failing with NameError or reusing lines from an earlier run.
rawList = []
try:
    rawText = parser.from_file(path)
    content = rawText['content']
except Exception:
    # Top-level boundary: whatever went wrong while parsing, the file is
    # treated as "not a readable PDF". KeyboardInterrupt/SystemExit are no
    # longer swallowed, unlike with a bare ``except:``.
    content = None

if content is None:
    # No text could be extracted from the file
    print('Ingen PDF-fil')
else:
    rawList = content.splitlines()

In [467]:
# Grades that count as a pass when a result line is decoded
approved_grades = ['A', 'B', 'C', 'D', 'E', 'G', 'VG', 'MVG', 'AB', 'BA', '3', '4', '5', 'P']

# Number of HP the student has to pass during the period
points_req = 15

# Date interval that is checked
# 1 September -> 1 February, both written as zero-padded ISO dates (YYYY-MM-DD)
start_date = '2019-09-01'
end_date = '2020-02-01'

def decode_ladok_extract(rawList):
    """Decode a Ladok extract and check it against the HP requirement.

    Every non-empty line containing "20" (taken as a sign that the line
    holds a date) is split on whitespace and matched against two token
    layouts, counted from the end of the line:

    * grade at position -2: points at -3, date at -1, name in ``[0:-4]``
    * grade at position -3 with a ``)`` in the token at -4: points at -4,
      date at -2, name in ``[0:-5]``

    When the name part is empty, the two preceding non-empty lines are
    joined and used as the course name instead.

    Args:
        rawList: The text of the extract as a list of lines.

    Returns:
        The value returned by ``calculate_approval`` for the decoded courses.

    Raises:
        ValueError: Propagated from pandas when a decoded points or date
            token cannot be converted.
    """
    completed_courses = []
    prev_line = 'None'
    prev2_line = 'None'

    for line in rawList:
        if line == '':  # skip empty lines
            continue

        if '20' in line:  # lines containing 20 may hold a date
            splitted = line.split()
            n_tokens = len(splitted)

            # Explicit length checks replace the former blanket try/except,
            # so short lines are skipped without hiding unrelated errors and
            # without skipping the previous-line bookkeeping below.
            # NOTE: the two layouts are checked independently; a line that
            # satisfies both is recorded twice.
            if n_tokens >= 3 and splitted[-2] in approved_grades:
                course_name = ' '.join(splitted[0:-4])
                if len(course_name) < 1:
                    course_name = prev2_line + ' ' + prev_line
                # accept a decimal comma as well as a decimal point
                course_points = splitted[-3].strip('()').replace(',', '.')
                course_complete_date = splitted[-1]
                completed_courses.append([course_name, course_points, course_complete_date])

            if n_tokens >= 4 and splitted[-3] in approved_grades and ')' in splitted[-4]:
                course_name = ' '.join(splitted[0:-5])
                if len(course_name) < 1:
                    course_name = prev2_line + ' ' + prev_line
                course_points = splitted[-4].strip('()').replace(',', '.')
                course_complete_date = splitted[-2]
                completed_courses.append([course_name, course_points, course_complete_date])

        # remember the last two non-empty lines for the course-name fallback
        prev2_line = prev_line.strip()
        prev_line = line.strip()

    # create dataframe of completed courses
    df = pd.DataFrame(completed_courses, columns=['Object', 'HP', 'Date'])
    df['HP'] = pd.to_numeric(df['HP'])
    df['Date'] = pd.to_datetime(df['Date'])

    return calculate_approval(df)

def decode_uni_extract(rawList):
    """Decode a result certificate issued by a university and check the HP requirement.

    Only the part of the document starting at the line ``'Resultatintyg'``
    is read. A line containing ``'Avklarade'`` switches to the section of
    completed courses; a line containing ``'slutrapporterade'`` switches to
    the section of courses that are not completed. Lines before the first
    of these markers are ignored.

    Inside a section, every non-empty line containing "20" (taken as a sign
    that the line holds a date) is split on whitespace. A result is recorded
    when the token at position -3 is an approved grade; in the completed
    section the token at -5 must also contain ``(``. Points are read from
    position -5, the date from -2 and the name from ``[1:-5]``. When the
    name part is empty, the two preceding section lines are joined and used
    instead.

    Args:
        rawList: The text of the certificate as a list of lines.

    Returns:
        The value returned by ``calculate_approval`` for the decoded courses.

    Raises:
        ValueError: If no line equals ``'Resultatintyg'``, or propagated
            from pandas when a decoded points or date token cannot be
            converted.
    """
    completed_courses = []
    section = None  # None until a heading is seen, then 'completed' or 'unfinished'
    prev_line = 'None'  # keep track of previous
    prev2_line = 'None'  # keep track of the line before the previous line

    rawList_reduced = rawList[rawList.index('Resultatintyg'):]

    for line in rawList_reduced:
        # The second check runs after the first, so a line carrying both
        # markers ends up in the unfinished section.
        if 'Avklarade' in line:
            section = 'completed'
        if 'slutrapporterade' in line:
            section = 'unfinished'

        if line == '' or section is None:
            continue

        if '20' in line:
            splitted = line.split()
            # A result line needs at least five tokens; shorter dated lines
            # are skipped instead of raising IndexError.
            if len(splitted) >= 5 and splitted[-3] in approved_grades:
                # completed courses must also show the points in parentheses
                if section == 'unfinished' or '(' in splitted[-5]:
                    course_name = ' '.join(splitted[1:-5])
                    if len(course_name) < 1:
                        course_name = prev2_line + ' ' + prev_line
                    course_points = splitted[-5].strip('()').replace(',', '.')
                    # soft hyphens in the date are replaced with ordinary ones
                    course_complete_date = splitted[-2].replace('\xad', '-')
                    completed_courses.append([course_name, course_points, course_complete_date])

        # remember the last two section lines for the course-name fallback
        prev2_line = prev_line.strip()
        prev_line = line.strip()

    # create dataframe of completed courses
    df = pd.DataFrame(completed_courses, columns=['Object', 'HP', 'Date'])
    df['HP'] = pd.to_numeric(df['HP'])
    df['Date'] = pd.to_datetime(df['Date'])

    return calculate_approval(df)

def calculate_approval(df, period_start=None, period_end=None, required_hp=None):
    """Sum the HP earned inside a period and decide whether it is enough.

    Args:
        df: DataFrame with a numeric ``HP`` column and a datetime ``Date``
            column, one row per passed result.
        period_start: Lower bound of the period (exclusive). Defaults to
            the module-level ``start_date``.
        period_end: Upper bound of the period (exclusive). Defaults to the
            module-level ``end_date``.
        required_hp: Minimum number of HP needed for approval. Defaults to
            the module-level ``points_req``.

    Returns:
        ``'Yes'`` when the HP inside the period reach the minimum,
        otherwise ``'No'``. The filtered rows, the total and the verdict
        are also printed.
    """
    if period_start is None:
        period_start = start_date
    if period_end is None:
        period_end = end_date
    if required_hp is None:
        required_hp = points_req

    # NOTE(review): both bounds are exclusive, so results dated exactly on
    # period_start or period_end are not counted - confirm this is intended.
    in_period = df[(df['Date'] > period_start) & (df['Date'] < period_end)]
    total_hp = in_period['HP'].sum()

    # The requirement is a minimum, so exactly required_hp is enough.
    approval = 'Yes' if total_hp >= required_hp else 'No'

    print(in_period)
    print('Total HP during period:', total_hp)

    print('Approved?', approval)

    return approval

# Work out which kind of document was submitted by looking for its title
# lines among the extracted text lines, then hand it to the matching decoder.
if all(title in rawList for title in ('Nationellt studieintyg',
                                      'Sveriges högskolor och universitet')):
    # national extract: both title lines have to be present
    print('Ladok extract')
    decode_ladok_extract(rawList)
elif 'Registreringsintyg' in rawList:
    # registration certificates are rejected with a message to the student
    print('Endast nationella studieintyg eller resultatintyg godkänns, '
          'du har skickat in ett registreringsintyg.')
elif 'Resultatintyg' in rawList:
    # result certificate from a single university
    print('Uni extract')
    decode_uni_extract(rawList)
else:
    # none of the known title lines were found
    print('Ogiltig fil')
                
    

Uni extract
                                          Object   HP       Date
0        Redovisning och beskattning, seminarier  1.5 2019-11-04
1  Samhälle, marknad och företagande, seminarier  1.5 2019-12-04
2                    Ekonomistyrning, seminarier  1.5 2020-01-14
3    Samhälle, marknad och företagande, tentamen  6.0 2020-01-30
Total HP during period: 10.5
Approved? No
