## Importing libraries 

In [None]:
import os
import re
from collections import Counter


### 1.1 Examining and loading data
- Examine the data folder, including the categories and job advertisement txt documents, etc. Explain your findings here, e.g., number of folders and format of txt files, etc.
- Load the data into proper data structures and get it ready for processing.
- Extract webIndex and description into proper data structures.


In [10]:
# Root folder of the dataset (each sub-directory inside it is walked later as one category)
data_folder = 'data/'

# Build the stop-word set from stopwords_en.txt, one entry per line of the file
with open('stopwords_en.txt', mode='r', encoding='utf-8') as file:
    stopword_lines = file.read().splitlines()
stopwords = set(stopword_lines)

### 1.2 Pre-processing data
Perform the required text pre-processing steps.

...... Sections and code blocks on basic text pre-processing


<span style="color: red"> You might have complex notebook structure in this section, please feel free to create your own notebook structure. </span>

In [13]:
# Tokenization pattern as given for the task: a run of ASCII letters, optionally
# followed by ONE hyphen- or apostrophe-joined run of letters (e.g. "full-time").
token_pattern = re.compile(r"[a-zA-Z]+(?:[-'][a-zA-Z]+)?")


def preprocess_description(text, stop_words=None, min_length=2):
    """Tokenize a single job description and return its cleaned tokens.

    Steps, in order: tokenize with ``token_pattern``, drop tokens shorter
    than ``min_length`` characters, lowercase, then drop stop words.

    Parameters
    ----------
    text : str
        Raw description text.
    stop_words : collection of str, optional
        Lowercase words to remove. When omitted, the notebook-level
        ``stopwords`` set is used, which matches the original behavior.
    min_length : int, optional
        Minimum number of characters a token must have to be kept
        (default 2, as in the original implementation).

    Returns
    -------
    list of str
        Lowercased tokens in their order of appearance in ``text``.
    """
    if stop_words is None:
        # Fall back to the stop-word set loaded earlier in the notebook.
        stop_words = stopwords

    # Length is checked on the raw token; lowercasing does not change the
    # length of ASCII letters, so the order of these two steps is immaterial.
    lowered = (
        token.lower()
        for token in token_pattern.findall(text)
        if len(token) >= min_length
    )
    return [token for token in lowered if token not in stop_words]

# Extract descriptions and preprocess them.
# `documents` holds one (file name, token list) pair per job advertisement;
# `all_tokens` is the flat concatenation of every token list, used later for
# corpus-wide term counts.
documents = []
all_tokens = []

# os.listdir() returns entries in arbitrary order, so both listings are sorted
# to make the document order (and the files written from it) reproducible.
for category in sorted(os.listdir(data_folder)):
    category_path = os.path.join(data_folder, category)
    if not os.path.isdir(category_path):
        continue
    for job_file in sorted(os.listdir(category_path)):
        job_path = os.path.join(category_path, job_file)
        # Skip nested directories and hidden OS artifacts (e.g. .DS_Store)
        # so they are not opened or tokenized as job advertisements.
        if job_file.startswith('.') or not os.path.isfile(job_path):
            continue
        with open(job_path, 'r', encoding='utf-8', errors='ignore') as file:
            text = file.read()
        # Assume the description starts after a certain line (e.g., third line).
        # NOTE(review): this keeps everything from the third line onward; if the
        # files carry further header lines or a field label before the
        # description, those words are tokenized too - confirm against the data.
        # NOTE(review): the section text asks for webIndex extraction as well;
        # only the file name is stored here.
        description = '\n'.join(text.split('\n')[2:])
        tokens = preprocess_description(description)
        documents.append((job_file, tokens))
        all_tokens.extend(tokens)

# Corpus-wide term frequency: number of occurrences of each token over all descriptions
term_freq = Counter()
term_freq.update(all_tokens)

# Keep only tokens that occur more than once in the whole corpus
repeated_terms = {term for term, count in term_freq.items() if count > 1}
documents = [
    (name, [tok for tok in toks if tok in repeated_terms])
    for name, toks in documents
]

# Document frequency: the number of documents in which each token appears.
# Each document contributes at most 1 per token, hence the set().
doc_freq = Counter()
for job_file, tokens in documents:
    doc_freq.update(set(tokens))

# Extract the top 50 most frequent words by document frequency.
# Counter.most_common() breaks ties by insertion order, which here would come
# from set iteration and can change between kernel sessions. Ranking by
# (-count, word) makes the selection deterministic.
ranked_by_doc_freq = sorted(doc_freq.items(), key=lambda item: (-item[1], item[0]))
top_50_words = {word for word, freq in ranked_by_doc_freq[:50]}

# Remove the extracted top 50 most frequent words from every document
documents = [
    (job_file, [token for token in tokens if token not in top_50_words])
    for job_file, tokens in documents
]

# Vocabulary: every distinct remaining token, in sorted order
vocab = sorted(set().union(*(tokens for _, tokens in documents)))
# Map each vocabulary word to its position in the sorted list
vocab_dict = dict(zip(vocab, range(len(vocab))))

print("Program run")

Program run


## Saving required outputs
Save the vocabulary, bigrams and job advertisement txt as per specification.
- vocab.txt

In [15]:
# Write the vocabulary, one "word:index" entry per line
with open('vocab.txt', 'w', encoding='utf-8') as file:
    file.writelines(f"{word}:{idx}\n" for word, idx in vocab_dict.items())

# Write the preprocessed job advertisements (needed for task 2_3),
# one "#<file name>,<space-separated tokens>" line per document
with open('preprocessed_jobs.txt', 'w', encoding='utf-8') as file:
    file.writelines(
        f"#{job_file},{' '.join(tokens)}\n" for job_file, tokens in documents
    )

print(f"Vocabulary size: {len(vocab)}")

Vocabulary size: 5283


## Summary


In [None]:
def generate_summary(sentence_scores, top_n=5):
    """Join the highest-scoring sentences into a single summary string.

    Parameters
    ----------
    sentence_scores : dict
        Maps each sentence to its score.
    top_n : int, optional
        How many of the best-scoring sentences to keep (default 5).

    Returns
    -------
    str
        The selected sentences, highest score first, separated by single
        spaces. Sentences with equal scores keep their dictionary order.
    """
    # Stable sort of the keys by their score, best first
    ranked = sorted(sentence_scores, key=sentence_scores.get, reverse=True)
    return ' '.join(ranked[:top_n])