# Dependencies

In [1]:
%matplotlib inline
import datetime as dt
import json
import random
import re
from collections import Counter

import matplotlib as mpl
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sb

# Text preprocessing

We use a test corpus consisting of three different courses: 
10031, 01005 and 23932.

Each document in the corpus is the concatenation of a course's 'General course objectives', 'Learning objectives' and 'Content' sections.

In [2]:
# (course code, path to the raw text file) pairs that make up the test corpus
raw_text_paths = [
    ('01005', 'documents_test/01005.txt'),
    ('23932', 'documents_test/23932.txt'),
    ('10031', 'documents_test/10031.txt'),
]

In [3]:
# Read every course description into memory, flattened onto a single line.
raw_text_dict = {}

for course, raw_text_path in raw_text_paths:
    # Explicit encoding so the result does not depend on the platform's locale.
    # NOTE(review): assumes the text files are UTF-8 encoded — confirm.
    with open(raw_text_path, 'r', encoding='utf-8') as file:
        # Wrapped in a one-element list so from_dict() yields one 'text' cell per course.
        raw_text_dict[course] = [file.read().replace('\n', ' ')]

# One row per course (index = course code) with a single 'text' column.
courses = pd.DataFrame.from_dict(raw_text_dict, orient='index', columns=['text'])

courses

Unnamed: 0,text
1005,The course content is the mathematical basis f...
23932,"The students should, based on the cell, acquir..."
10031,The overall goal is to give Physics and Nanote...


## Tokenizer

In [4]:
# Stop words are removed from the tokenized text. We get the stop words here.
# Every character other than a-z, '_' and '-' is stripped from each stop word
# (e.g. apostrophes), and the result is kept as a set so that the membership
# test in tokenize() is a constant-time lookup instead of a list scan.
stop_words = {
    re.sub('[^a-z_-]', '', stop_word)
    for stop_word in nltk.corpus.stopwords.words('english')
}

def tokenize(text: str) -> list[str]:
    """Tokenize a text into lowercase word tokens.

    The text is lowercased and every character other than a-z is replaced
    by a space, so digits, punctuation and non-ASCII letters act as word
    separators. Stop words and single-character tokens are then dropped.

    Repeated words are kept: the returned list is NOT de-duplicated, which
    is what allows the term-frequency computation in this notebook to count
    occurrences.

    Arguments:
        text (str): raw text of one document.

    Returns:
        list[str]: the tokens that remain after filtering.
    """
    text = text.lower()
    text = re.sub('[^a-z]', ' ', text)
    tokens = nltk.word_tokenize(text)

    # Drop one-letter leftovers (e.g. from split contractions) and stop words.
    return [word for word in tokens if len(word) > 1 and word not in stop_words]

In [5]:
# The tokenized text is saved in a new column.
# Only the 'text' column is needed, so apply tokenize() to that Series directly
# instead of building a row object for every course with DataFrame.apply(axis=1).
courses['tokens'] = courses['text'].apply(tokenize)
courses.head()

Unnamed: 0,text,tokens
1005,The course content is the mathematical basis f...,"[course, content, mathematical, basis, broad, ..."
23932,"The students should, based on the cell, acquir...","[students, based, cell, acquire, basic, unders..."
10031,The overall goal is to give Physics and Nanote...,"[overall, goal, give, physics, nanotechnology,..."


In [8]:
# Build the corpus: a dictionary mapping each course (the DataFrame index)
# to its list of tokens
corpus = courses['tokens'].to_dict(into=dict)

# TF-IDF analysis

In [6]:
def term_frequency(document: list) -> tuple[np.ndarray, np.ndarray]:
    """Compute the normalized term frequency of a tokenized document.

    The count of each distinct term is divided by the total number of tokens
    in the document.

    Arguments:
        document (list): tokens of one document; repeated tokens are counted.

    Returns:
        A tuple ``(tf, terms)`` of two arrays of equal length, in that order:
        ``tf[i]`` is the relative frequency of ``terms[i]``. Terms are listed
        in order of first occurrence in ``document``. Both arrays are empty
        for an empty document.
    """
    # Counter keeps keys in first-seen order, so terms and counts stay aligned.
    counts = Counter(document)
    terms = np.array(list(counts.keys()))

    if not counts:
        # Nothing to normalize; avoid dividing by a zero token total.
        return np.array([], dtype=float), terms

    n_words = sum(counts.values())
    tf = np.array(list(counts.values())) / n_words

    return tf, terms


def inverse_document_frequency(documents: dict[str, list], corpus=None) -> dict[str, float]:
    """Compute the inverse document frequency (idf) of every term in a corpus.

    For a term that occurs in ``df`` of the ``N`` selected documents the idf is
    ``ln(N / df)``; a term present in every selected document therefore gets 0.

    Arguments:
        documents (dict): mapping from document id to that document's tokens.
        corpus (iterable, optional): ids (keys of ``documents``) of the
            documents to include. Defaults to every document in ``documents``.

    Returns:
        dict: idf value per term, ordered alphabetically by term. Empty when
        no documents are selected or the selected documents have no tokens.
    """
    doc_ids = list(documents.keys() if corpus is None else corpus)
    n_documents = len(doc_ids)

    # A document counts at most once per term, so count over the set of its tokens.
    document_counts = Counter()
    for doc_id in doc_ids:
        document_counts.update(set(documents[doc_id]))

    if not document_counts:
        return {}

    terms = sorted(document_counts)
    counts = np.array([document_counts[term] for term in terms])
    idf = np.log(n_documents / counts)

    return dict(zip(terms, idf))

def tf_idf(document: list, idf_lookup: dict[float]) -> tuple[np.array, np.array]:
    """Weight the term frequencies of a document by inverse document frequency.

    Arguments:
        document (list): tokens of one document.
        idf_lookup (dict): idf value per term. Every term of ``document`` is
            looked up by subscript, so with a plain dict a missing term
            raises ``KeyError``.

    Returns:
        A tuple ``(tfidf, terms)`` where ``tfidf[i]`` is the term frequency of
        ``terms[i]`` multiplied by its idf.
    """
    frequencies, vocabulary = term_frequency(document)

    # Collect the idf weights in the same order as the vocabulary.
    weights = []
    for term in vocabulary:
        weights.append(idf_lookup[term])

    return frequencies * np.array(weights), vocabulary

In [11]:
# Five most frequent terms of every course, ranked by term frequency
print('Top 5 terms sorted by term frequency (TF):')

for course_id, tokens in corpus.items():
    frequencies, vocabulary = term_frequency(tokens)
    # argsort is ascending; reverse it so the highest frequency comes first
    ranking = np.argsort(frequencies)[::-1]
    print(f'{course_id}:')
    print(vocabulary[ranking[:5]])

Top 5 terms sorted by term frequency (TF):
01005:
['use' 'linear' 'equations' 'functions' 'applications']
23932:
['biological' 'cell' 'describe' 'structure' 'students']
10031:
['physics' 'sections' 'well' 'technology' 'within']


In [12]:
# idf of every term, computed over all documents in the corpus
idf_lookup = inverse_document_frequency(documents=corpus)

In [14]:
# Top terms per course: ranked by raw term frequency first, then by TF-IDF.
# `topx` drives both the cutoff and the printed headings.
topx = 10


def print_top_terms(title, score_fn, top_n):
    """Print, for every course in `corpus`, its `top_n` highest-scoring terms.

    Arguments:
        title (str): heading printed before the per-course lists.
        score_fn (callable): takes a document's token list and returns a
            ``(scores, terms)`` tuple of aligned arrays.
        top_n (int): number of terms to print per course.
    """
    print(title)
    for course, document in corpus.items():
        scores, terms = score_fn(document)
        # argsort is ascending; reverse it so the highest score comes first
        top_indices = np.argsort(scores)[::-1][:top_n]
        print(f'{course}:')
        print(terms[top_indices])


print_top_terms(f'Top {topx} terms sorted by term frequency (TF):', term_frequency, topx)

print('\n')

print_top_terms(
    f'Top {topx} terms sorted by TF-IDF:',
    lambda document: tf_idf(document, idf_lookup),
    topx,
)

Top 10 terms sorted by term frequency (TF):
01005:
['use' 'linear' 'equations' 'functions' 'applications' 'elementary'
 'mathematical' 'systems' 'complex' 'differential']
23932:
['biological' 'cell' 'describe' 'structure' 'students' 'basic' 'processes'
 'explain' 'regulation' 'mechanisms']
10031:
['physics' 'sections' 'well' 'technology' 'within' 'subjects' 'laboratory'
 'work' 'data' 'engineering']


Top 10 terms sorted by TF-IDF:
01005:
['linear' 'equations' 'elementary' 'applications' 'differential'
 'mathematical' 'systems' 'complex' 'mathematics' 'vector']
23932:
['biological' 'cell' 'describe' 'basic' 'processes' 'regulation'
 'mechanisms' 'living' 'energy' 'pro']
10031:
['physics' 'sections' 'subjects' 'data' 'group' 'ultrafast' 'optical'
 'pulses' 'hydrodynamics' 'lab']
