**Job description n-gram frequency**

By: PodiPeti

In [6]:
import pandas as pd
import json
from collections import Counter
from nltk import bigrams, trigrams
from itertools import chain

INPUT

In [7]:
# Preprocessed job postings: tokenized, lemmatized, replaced, filtered.
df = pd.read_csv('input/preprocessed_jobs_all.csv')

# Decode the JSON explicitly as UTF-8 (the encoding the JSON standard prescribes)
# instead of relying on the platform's locale encoding, which varies by machine.
with open('input/style.json', 'r', encoding='utf-8') as json_file:
    style = json.load(json_file)

GENERATING N-GRAMS

In [8]:
def get_ngrams(description, n):
    """Collect the word n-grams of every text in ``description``.

    Each text is split on whitespace and n-grams are built inside a single
    text only; they never span two texts.

    Args:
        description: Iterable of strings (for example a DataFrame column).
            Entries that are not strings, such as the NaN that pandas
            yields for an empty CSV cell, are skipped instead of raising.
        n: Size of the n-grams; any positive whole number.

    Returns:
        A flat list in input order. For ``n == 1`` the items are the words
        themselves (plain strings); for larger ``n`` they are tuples of
        ``n`` consecutive words. A text with fewer than ``n`` words
        contributes nothing.

    Raises:
        ValueError: If ``n`` is not a positive whole number.
    """
    try:
        size = int(n)
    except (TypeError, ValueError, OverflowError):
        size = 0
    # `size != n` rejects fractional values (2.5) and numeric strings ("2").
    if size < 1 or size != n:
        raise ValueError("n should be a positive integer (1, 2, 3, ...)")

    # Lazily tokenize, ignoring missing / non-text entries.
    token_lists = (text.split() for text in description if isinstance(text, str))

    if size == 1:
        # Unigrams stay plain strings rather than 1-tuples.
        return [word for tokens in token_lists for word in tokens]

    # Zipping the token list against its own shifted copies yields every
    # window of `size` consecutive tokens; zip stops at the shortest copy,
    # so short texts produce no n-grams.
    return [
        gram
        for tokens in token_lists
        for gram in zip(*(tokens[offset:] for offset in range(size)))
    ]
    


SAVE TO EXCEL

In [9]:
def save_excel(frequencies, ngram_type, excel_writer):
    """Write an n-gram frequency table to one sheet of an Excel workbook.

    Args:
        frequencies: Mapping of n-gram -> count. Keys may be plain strings
            (unigrams) or tuples of words (bigrams, trigrams, ...).
        ngram_type: Label used both as the header of the n-gram column and
            as the sheet name.
        excel_writer: Target passed straight to ``DataFrame.to_excel``
            (an open ``pd.ExcelWriter`` in this notebook).

    The sheet has two columns, ``ngram_type`` and ``'Frequency'``, sorted by
    descending frequency and written without the index.
    """
    # Decide per key whether it needs joining, instead of inferring it from
    # the label: tuple keys become space-separated text, string keys are
    # written as they are (joining a string would put a space between its
    # characters).
    rows = [
        (' '.join(gram) if isinstance(gram, tuple) else gram, count)
        for gram, count in frequencies.items()
    ]
    table = pd.DataFrame(rows, columns=[ngram_type, 'Frequency'])
    table = table.sort_values(by='Frequency', ascending=False)
    table.to_excel(excel_writer, sheet_name=f'{ngram_type}', index=False)

COUNT N-GRAMS, FILTER BY FREQUENCY AND EXPORT

In [10]:
# An n-gram must occur at least this many times across all descriptions to be exported.
MIN_FREQUENCY = 50

# Using the writer as a context manager closes the workbook when the block
# exits, including when one of the steps below raises.
with pd.ExcelWriter('output/data/description_freq_ngrams_all.xlsx', engine='xlsxwriter') as excel_file:
    for ngram_type, n in [('unigram', 1), ('bigram', 2), ('trigram', 3)]:
        ngrams = get_ngrams(df['description'], n)
        ngram_freq = Counter(ngrams)

        # Drop n-grams below the frequency threshold
        ngram_freq = {gram: freq for gram, freq in ngram_freq.items() if freq >= MIN_FREQUENCY}

        # Save frequencies to Excel (one sheet per n-gram size)
        save_excel(ngram_freq, ngram_type, excel_file)