**Description n-gram frequency & TF-IDF**

By: PodiPeti

In [31]:
import os
from collections import Counter
from itertools import chain

import numpy as np
import pandas as pd
from nltk import bigrams, trigrams
from sklearn.feature_extraction.text import TfidfVectorizer

INPUT

In [32]:
# Load the preprocessed job postings (already tokenized, lemmatized, replaced and filtered).
df = pd.read_csv(filepath_or_buffer='input/preprocessed_jobs_all.csv')

GENERATING N-GRAMS

In [33]:
def get_ngrams(description, n):
    """Collect every word n-gram from an iterable of description strings.

    Each description is split on whitespace and handled on its own, so an
    n-gram never spans two descriptions.

    Parameters
    ----------
    description : iterable of str
        The texts to process. Entries that are not strings (for example the
        NaN pandas produces for an empty CSV cell) are skipped instead of
        raising ``AttributeError`` on ``.split()``.
    n : int
        Size of the n-grams; any positive integer.

    Returns
    -------
    list
        For ``n == 1`` a flat list of words (``str``); otherwise a list of
        ``n``-tuples of consecutive words, in order of appearance.

    Raises
    ------
    ValueError
        If ``n`` is not a positive integer.
    """
    # Accept anything that compares equal to a positive integer (int, numpy
    # integer, 2.0, ...) and reject everything else with ValueError.
    try:
        size = int(n)
    except (TypeError, ValueError, OverflowError):
        size = 0
    if size < 1 or size != n:
        raise ValueError("n should be a positive integer")

    ngrams = []
    for text in description:
        if not isinstance(text, str):
            continue  # missing / non-text entry
        tokens = text.split()
        if size == 1:
            ngrams.extend(tokens)
        else:
            # Sliding window: zip the token list with itself shifted by
            # 0..size-1 positions; it stops at the shortest slice, so texts
            # with fewer than `size` tokens contribute nothing.
            ngrams.extend(zip(*(tokens[offset:] for offset in range(size))))
    return ngrams
    


TF-IDF & SAVE TO EXCEL

In [34]:
def save_tfidf_and_freq_to_excel(descriptions, ngram_type, n, excel_writer, level):
    """Write the 50 highest-scoring n-grams of a corpus to one Excel sheet.

    For every n-gram of size ``n`` the function computes the TF-IDF score
    summed over all documents and the raw number of occurrences, sorts by the
    summed score (descending) and writes the first 50 rows to the sheet
    ``f'{ngram_type}_top_50_tfidf_freq'`` of ``excel_writer``.

    Parameters
    ----------
    descriptions : iterable of str
        One text per document. Missing values (NaN/None) are dropped.
    ngram_type : str
        Label used as the first column header and as sheet-name prefix.
    n : int
        n-gram size; passed to the vectorizer as ``ngram_range=(n, n)``.
    excel_writer : pandas.ExcelWriter
        Open writer that receives the sheet.
    level
        Not used inside the function; kept so existing callers keep working.

    Returns
    -------
    None
    """
    sheet_name = f'{ngram_type}_top_50_tfidf_freq'
    columns = [ngram_type, 'TF-IDF Score', 'Frequency']

    # Empty CSV cells come back from pandas as NaN, which the vectorizer
    # cannot lowercase/tokenize.
    documents = pd.Series(descriptions).dropna().astype(str)

    vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(n, n))

    # Count raw occurrences with the vectorizer's own analyzer so that the
    # keys are exactly the vocabulary terms. Counting on a plain whitespace
    # split does not match: the default analyzer lowercases the text and uses
    # its own token pattern, so looked-up frequencies came out as 0 or wrong.
    analyze = vectorizer.build_analyzer()
    freqs = Counter(chain.from_iterable(analyze(doc) for doc in documents))

    if not freqs:
        # No n-gram of this size exists (no documents, or all too short).
        # fit_transform would raise on the empty vocabulary, so write a
        # header-only sheet instead of aborting the workbook.
        pd.DataFrame(columns=columns).to_excel(excel_writer, sheet_name=sheet_name, index=False)
        return

    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()  # aligned with matrix columns

    # Sum the TF-IDF scores of each n-gram over all documents; flatten to 1-D
    # so the result does not depend on np.matrix indexing semantics.
    summed_tfidf = np.asarray(tfidf_matrix.sum(axis=0)).ravel()

    ranking = pd.DataFrame({
        ngram_type: feature_names,
        'TF-IDF Score': summed_tfidf,
        'Frequency': [freqs[name] for name in feature_names],
    })

    # Stable sort keeps the alphabetical feature order among equal scores,
    # which makes the top-50 cut reproducible.
    top_50 = ranking.sort_values(by='TF-IDF Score', ascending=False, kind='stable').head(50)
    top_50.to_excel(excel_writer, sheet_name=sheet_name, index=False)

PROCESS AND CALL FUNCTIONS

In [35]:
# Write one workbook per job level, each with a unigram, bigram and trigram sheet.
OUTPUT_DIR = 'output/data'
os.makedirs(OUTPUT_DIR, exist_ok=True)  # ExcelWriter does not create missing folders

# groupby skips rows whose level is missing; iterating over unique() included
# NaN, for which `df['level'] == level` selects no rows and the vectorizer
# then fails on an empty corpus. sort=False keeps first-appearance order.
for level, level_df in df.groupby('level', sort=False):
    excel_file_path = f'{OUTPUT_DIR}/{level}_description_top_50_ngrams_tfidf_freq.xlsx'
    with pd.ExcelWriter(excel_file_path, engine='xlsxwriter') as excel_file:
        for ngram_type, n in [('unigram', 1), ('bigram', 2), ('trigram', 3)]:
            save_tfidf_and_freq_to_excel(level_df['description'], ngram_type, n, excel_file, level)