NER Tagger: https://tecoholic.github.io/ner-annotator/

Goal: tag 300 documents in total, made up of 150 job descriptions (JDs) and 150 module descriptions.

# Gathering Module Descriptions

In [1]:
import pandas as pd
import os
import random

In [2]:
# One row per university source file:
#   (file name, column holding the module description, leading rows to skip).
# Both lookup tables below are derived from this single list so that they
# always share the same keys in the same order.
_UNI_SOURCES = [
    ("nus_dsa_mods.xlsx", "mod_desc", 0),
    ("NTU_course_info.csv", "Course Aims", 0),
    ("SMU_course_info.csv", "Description", 0),
    ("SUSS_course_info.csv", "module description", 0),
    ("SUTD_course_info.csv", "Module description", 5),
    # The trailing space in this column name is part of the key as written;
    # presumably it mirrors the CSV header exactly -- verify before "fixing".
    ("SIT_Module_Info.csv", "Module Description ", 0),
]

# File name -> name of the column that contains the module description text.
UNI_MODDESC_MAPPING = {
    file_name: description_column
    for file_name, description_column, _ in _UNI_SOURCES
}

# File name -> number of rows to skip at the top of the file when reading it.
SKIP_ROWS = {
    file_name: rows_to_skip
    for file_name, _, rows_to_skip in _UNI_SOURCES
}

In [3]:
# Folder containing the university course files listed in UNI_MODDESC_MAPPING.
# File names are appended to this string directly, so the trailing slash is required.
# NOTE(review): READ_FOLDER is reassigned in the job-description section further
# down; re-running the module-gathering cell after that point would read from
# the jobs folder instead.
READ_FOLDER = "../../../Data/university_courses/"

In [4]:
# Gather the module-description column of every university file into a single
# Series with a fresh 0..n-1 index.
EXCEL_EXTENSIONS = (".xlsx", ".xls")

mod_description_parts = []
for uni, description_col in UNI_MODDESC_MAPPING.items():
    print(f"Gathering module descriptions from {uni}")
    source_path = READ_FOLDER + uni

    # Pick the reader from the file extension instead of calling read_excel on
    # everything and catching the failure: a bare `except:` would also hide
    # genuine problems (missing file, missing Excel engine, KeyboardInterrupt)
    # and then misreport them as a CSV parsing error.
    if uni.lower().endswith(EXCEL_EXTENSIONS):
        table = pd.read_excel(source_path, skiprows=SKIP_ROWS[uni])
    else:
        # Undecodable bytes are dropped rather than raising.
        table = pd.read_csv(source_path, skiprows=SKIP_ROWS[uni], encoding_errors='ignore')

    # Rows without a description carry nothing to annotate.
    mod_description_parts.append(table[description_col].dropna())

# Concatenate once at the end rather than growing a Series inside the loop.
if mod_description_parts:
    mod_descriptions = pd.concat(mod_description_parts, ignore_index=True)
    # Keep the combined Series unnamed regardless of the source column names.
    mod_descriptions.name = None
else:
    mod_descriptions = pd.Series([], dtype='object')

display(mod_descriptions)

Gathering module descriptions from nus_dsa_mods.xlsx
Gathering module descriptions from NTU_course_info.csv
Gathering module descriptions from SMU_course_info.csv
Gathering module descriptions from SUSS_course_info.csv
Gathering module descriptions from SUTD_course_info.csv
Gathering module descriptions from SIT_Module_Info.csv


0      This module introduces the fundamental concept...
1      The abundance of data being harvested from var...
2      This module is a first course in linear algebr...
3      This is a course in single-variable calculus. ...
4      This module introduces students to the design ...
                             ...                        
173    Students will be grouped into teams of 5-6 and...
174    To keep up-to-date with the advances in techno...
175    Students will undertake an eight-month Integra...
176    This is a major individual project that is to ...
177    This module will endow students with the under...
Length: 178, dtype: object

In [5]:
# Select 150 random module descriptions
SAMPLE_SIZE = 150
RANDOM_SEED = 42  # fixed so that Restart & Run All selects the same documents every time

# A dedicated Random instance makes the draw independent of whatever has
# already consumed the global `random` state in this kernel.
mod_sampler = random.Random(RANDOM_SEED)

# random.sample raises ValueError when asked for more items than exist, so cap
# the draw at the number of descriptions actually available.
n_mod_samples = min(SAMPLE_SIZE, len(mod_descriptions))
if n_mod_samples < SAMPLE_SIZE:
    print(f"Only {n_mod_samples} module descriptions available; using all of them")

# Sorting the drawn positions keeps the sample in the original document order.
indices = sorted(mod_sampler.sample(range(len(mod_descriptions)), n_mod_samples))
mod_descriptions = mod_descriptions.iloc[indices].reset_index(drop=True)
display(mod_descriptions)

0      This module introduces the fundamental concept...
1      The abundance of data being harvested from var...
2      This is a course in single-variable calculus. ...
3      This module aims at introducing basic concepts...
4      This module applies advanced calculus to pract...
                             ...                        
145    The students will be introduced to a number of...
146    Students will be grouped into teams of 5-6 and...
147    To keep up-to-date with the advances in techno...
148    Students will undertake an eight-month Integra...
149    This module will endow students with the under...
Length: 150, dtype: object

# Gathering Job Descriptions

In [6]:
# Folder with the scraped job-posting CSV files. File names are appended to
# this string directly, so the trailing slash is required.
# Expressed relative to the notebook, like the university-course folder, so the
# notebook is not tied to one user's machine.
# NOTE(review): assumes "../../../Data" is the repository's Backend/Data folder -- confirm.
READ_FOLDER = "../../../Data/jobs/"

# Keep only names that END in ".csv" (a substring test would also accept e.g.
# "x.csv.bak"), and sort them because os.listdir returns entries in arbitrary
# order, which would otherwise change the row order of the combined data.
job_files = sorted(
    file_name
    for file_name in os.listdir(READ_FOLDER)
    if file_name.endswith(".csv")
)
job_files

['jobstreet_query-data_analyst.csv',
 'jobstreet_query-data_engineer.csv',
 'jobstreet_query-data_science.csv',
 'jobstreet_query-machine_learning_engineer.csv',
 'mycareersfuture_query-data_analyst.csv',
 'mycareersfuture_query-data_engineer.csv',
 'mycareersfuture_query-data_science.csv',
 'mycareersfuture_query-machine_learning_engineer.csv']

In [7]:
# Gather the 'Description' column of every job CSV into a single Series with a
# fresh 0..n-1 index.
job_description_parts = []
for job_file in job_files:
    table = pd.read_csv(READ_FOLDER + job_file)
    # Postings without a description carry nothing to annotate.
    job_description_parts.append(table['Description'].dropna())

# Concatenate once at the end rather than growing a Series inside the loop.
if job_description_parts:
    job_descriptions = pd.concat(job_description_parts, ignore_index=True)
    # Every part is named 'Description'; clear the name so the combined Series
    # stays unnamed, as it was when built from an unnamed empty Series.
    job_descriptions.name = None
else:
    # No job files found: keep an empty Series so later cells still run.
    job_descriptions = pd.Series([], dtype='object')

In [8]:
# Select 150 random job descriptions
SAMPLE_SIZE = 150
RANDOM_SEED = 42  # fixed so that Restart & Run All selects the same documents every time

# A dedicated Random instance makes the draw independent of whatever has
# already consumed the global `random` state in this kernel.
job_sampler = random.Random(RANDOM_SEED)

# random.sample raises ValueError when asked for more items than exist, so cap
# the draw at the number of descriptions actually available.
n_job_samples = min(SAMPLE_SIZE, len(job_descriptions))
if n_job_samples < SAMPLE_SIZE:
    print(f"Only {n_job_samples} job descriptions available; using all of them")

# Sorting the drawn positions keeps the sample in the original document order.
indices = sorted(job_sampler.sample(range(len(job_descriptions)), n_job_samples))
job_descriptions = job_descriptions.iloc[indices].reset_index(drop=True)
display(job_descriptions)

0      Overall Job Purpose: Provide expertise to anal...
1      Responsibilities Job Description Ecommerce's G...
2      Responsibilities About TikTok TikTok is the le...
3      Inventory Data Analyst Job summary: Buyer key ...
4      This is an exciting opportunity to work for on...
                             ...                        
145    Participate in the design, development, and te...
146    Investment Insights Group (IIG)\r\nThe Investm...
147    Job Description:\r\n1. Coordinating with team ...
148    Benefits Summary:\r\n• Engineering Industry\r\...
149    Cialfo is a leading platform connecting studen...
Length: 150, dtype: object

# Combining module and job descriptions

In [14]:
# Output folder for the unlabelled text that is fed to the NER annotator.
# The file name is appended to this string directly, so the trailing slash is required.
# Expressed relative to the notebook, like the university-course folder, so the
# notebook is not tied to one user's machine.
# NOTE(review): assumes "../../../Data" is the repository's Backend/Data folder -- confirm.
SAVE_PATH = "../../../Data/NER_annotated_data/Job_Mod_Descriptions/"
SAVE_FILE_NAME = "job_mod_desc-unlabelled.txt"

In [18]:
# Write every sampled description (modules first, then jobs) to one UTF-8 text
# file, each followed by a "NEXT ENTRY" line that separates the documents.
#
# newline="\n" switches off the platform newline translation. With the default
# text mode on Windows every "\n" is written as "\r\n", so the "\r\n" sequences
# already present in the scraped job descriptions would come out as "\r\r\n".
with open(SAVE_PATH + SAVE_FILE_NAME, "w", encoding='utf-8', newline="\n") as f:
    for description in pd.concat([mod_descriptions, job_descriptions]):
        # str() guards against a non-string cell (f.write would raise TypeError);
        # the replaces normalise CRLF / lone CR so the file uses LF throughout.
        text = str(description).replace("\r\n", "\n").replace("\r", "\n")
        f.write(text)
        f.write("\nNEXT ENTRY\n")