**Job descriptions: dictionary keyword frequency and TF-IDF**

By: PodiPeti

In [143]:
import pandas as pd
import json
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

In [144]:
def preprocess_phrases(text, phrases):
    """Replace the spaces inside each known phrase with underscores.

    Every occurrence of a phrase in ``text`` is rewritten so that its
    spaces become underscores (``"power bi"`` -> ``"power_bi"``).

    Phrases are substituted longest first. If a shorter phrase such as
    ``"machine learning"`` were rewritten before a longer phrase that
    contains it (``"machine learning engineer"``), the longer phrase would
    no longer match, and the result would depend on the order in which
    ``phrases`` happens to be iterated.

    Args:
        text: The string to process.
        phrases: Iterable of phrase strings to look for in ``text``.

    Returns:
        ``text`` with every matched phrase joined by underscores.
    """
    # sorted() is stable, so phrases of equal length keep their given order.
    for phrase in sorted(phrases, key=len, reverse=True):
        # A phrase without spaces would be replaced by itself; skip it.
        if ' ' in phrase:
            text = text.replace(phrase, phrase.replace(' ', '_'))
    return text

In [145]:
def filter_by_pos_with_keywords(text, tags_to_keep, keyword_set):
    """Keep only the words of ``text`` that pass a POS or keyword filter.

    The text is tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``. A word survives when its tag is in ``tags_to_keep`` or
    when the word itself is in ``keyword_set``.

    Args:
        text: The string to filter.
        tags_to_keep: Container of part-of-speech tags to retain.
        keyword_set: Container of words that are retained regardless of tag.

    Returns:
        The surviving words, in their original order, joined by single
        spaces.
    """
    kept_words = []
    for word, tag in pos_tag(word_tokenize(text)):
        # Drop the word only when neither the tag nor the keyword rule keeps it.
        if tag not in tags_to_keep and word not in keyword_set:
            continue
        kept_words.append(word)
    return " ".join(kept_words)

In [146]:
# Handles keywords that span multiple words.
def custom_tokenizer(text, keyword_set):
    """Tokenize ``text``, keeping only keyword tokens and multi-word keywords.

    ``text`` is split with ``word_tokenize``. A token that is a member of
    ``keyword_set`` is kept unchanged. For a token that is not a keyword,
    the token itself is dropped and ``keyword_set`` is scanned instead:
    every keyword containing a space that occurs as a substring of ``text``
    is appended with its spaces replaced by underscores, and is then removed
    from the local copy of ``text``.

    Args:
        text: The string to tokenize.
        keyword_set: Collection of keyword strings; entries containing a
            space are treated as multi-word keywords.

    Returns:
        A list of the kept tokens and underscore-joined multi-word keywords.
    """
    tokens = word_tokenize(text)
    processed_tokens = []
    for token in tokens:
        if token in keyword_set:
            processed_tokens.append(token)
        else:
            # Split and check for multi-word phrases
            # NOTE(review): this scan only runs when a non-keyword token is
            # seen; if every token is in keyword_set, no multi-word keyword
            # is ever emitted. Multi-word keywords are appended at the
            # position of the non-keyword token that triggered the scan, not
            # where the phrase occurs in the text -- confirm this is intended.
            for keyword in keyword_set:
                # `keyword in text` is a substring test on the raw text, not
                # a comparison against the tokens produced above.
                if ' ' in keyword and keyword in text:
                    processed_tokens.append(keyword.replace(' ', '_'))
                    # Removing the phrase from the local text stops later
                    # scans (for later non-keyword tokens) from adding it again.
                    text = text.replace(keyword, '')  # avoid duplication
    return processed_tokens

In [147]:
def calculate_tfidf(df, keywords):
    """Score a fixed keyword vocabulary against the job descriptions.

    A ``TfidfVectorizer`` restricted to ``keywords`` (n-grams of one to
    three terms) is fitted on ``df['description']``. For every keyword the
    TF-IDF weights are summed over all descriptions, and the number of
    descriptions with a non-zero weight is counted.

    NOTE(review): ``TfidfVectorizer`` lower-cases documents by default, so
    keywords containing upper-case characters may never match -- confirm
    the keyword lists are lower-case.

    Args:
        df: DataFrame with a ``'description'`` column of text documents.
        keywords: Vocabulary handed to the vectorizer.

    Returns:
        DataFrame indexed by keyword with the columns ``'frequency'`` and
        ``'tf-idf score'``, sorted by ``'tf-idf score'`` in descending order.
    """
    tfidf = TfidfVectorizer(vocabulary=keywords, ngram_range=(1, 3))

    # fit_transform yields documents x keywords; transpose so that each row
    # is a keyword and row-wise reductions give per-keyword totals.
    keyword_rows = tfidf.fit_transform(df['description']).transpose()

    # .A1 flattens the matrix returned by sum() into a 1-D array.
    per_keyword = {
        'frequency': (keyword_rows > 0).sum(axis=1).A1,
        'tf-idf score': keyword_rows.sum(axis=1).A1,
    }
    scores = pd.DataFrame(per_keyword, index=tfidf.get_feature_names_out())

    return scores.sort_values(by='tf-idf score', ascending=False)

INPUT

In [148]:
# Job postings; calculate_tfidf reads the 'description' column.
df = pd.read_csv('input/preprocessed_jobs_all.csv')
# Noun part-of-speech tags. Not used in the visible code below --
# presumably intended for filter_by_pos_with_keywords; confirm.
tags_to_keep = ['NN', 'NNS', 'NNP', 'NNPS']

# JSON files are UTF-8 text; name the encoding explicitly so the keyword
# lists load the same way regardless of the platform's default encoding.
with open('keywords/coding_keywords.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
    coding_keywords = data['languages']
coding_df = calculate_tfidf(df, coding_keywords)

with open('keywords/softwares_keywords.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
    softwares_keywords = data['softwares']
softwares_df = calculate_tfidf(df, softwares_keywords)

OUTPUT

In [149]:
# Write each keyword table to its own sheet of a single workbook.
sheets = {
    'Coding Keywords': coding_df,
    'Software Keywords': softwares_df,
}
with pd.ExcelWriter('output/data/keyword_analysis.xlsx') as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name)