In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from sklearn.metrics import silhouette_score, davies_bouldin_score
import numpy as np

from tqdm import tqdm



In [6]:
import warnings
warnings.filterwarnings("ignore")

cosine sim for all datapoints

In [7]:
def remove_duplicates(df:pd.DataFrame, exclude=None):
    subset_columns = ['Job Title', 'Company Name', 'Location', 'Salary Estimate', 'Rating', 'Job Description']
    if exclude:
        if isinstance(exclude, list):
            subset_columns = [a for a in subset_columns if a not in exclude]
        else:
            subset_columns.remove(exclude)
    
    df.sort_values(subset_columns, inplace=True)

    df['Duplicate Count'] = df.groupby(subset_columns)['Job Title'].transform('count')
    print(df['Duplicate Count'].value_counts())
    df.drop_duplicates(subset=subset_columns, inplace=True, keep='last')
    df.reset_index(drop=True,inplace=True)
    

In [8]:

df_business_analyst = pd.read_csv('../data_jobs_data/data/BusinessAnalyst clean.csv')
df_data_analyst = pd.read_csv('../data_jobs_data/data/DataAnalyst clean.csv')
df_data_engineer = pd.read_csv('../data_jobs_data/data/DataEngineer clean.csv')
df_data_scientist = pd.read_csv('../data_jobs_data/data/DataScientist clean.csv')

# Combine data for TF-IDF analysis
combined_data = pd.concat([
    df_business_analyst.assign(role="Business Analyst"),
    df_data_analyst.assign(role="Data Analyst"),
    df_data_scientist.assign(role="Data Scientist"),
    df_data_engineer.assign(role="Data Engineer"),
])
# print(combined_data.role.value_counts())
# combined_data
remove_duplicates(combined_data, exclude=['Location','Salary Estimate']) # sometimes salary is location-dependent. we are only analyzing job descriptions so this is fine.
print(combined_data.role.value_counts())

# Job Title,Company Name,Location,Salary Estimate,Rating,Job Description,Founded,Industry,Revenue,Sector,Size,Type,Easy Apply,Duplicate Count,
# Job Title clean,Salary Type,min_salary,max_salary,job_description_cleaned,Years Experience

combined_data

Duplicate Count
1.0     10704
2.0      2288
3.0       990
4.0       192
5.0        75
8.0        56
6.0        54
47.0       47
7.0        42
26.0       26
11.0       22
9.0        18
15.0       15
14.0       14
10.0       10
Name: count, dtype: int64
role
Data Analyst        4691
Business Analyst    3485
Data Scientist      3114
Data Engineer       2322
Name: count, dtype: int64


Unnamed: 0,Job Title,Company Name,Location,Salary Estimate,Rating,Job Description,Founded,Industry,Revenue,Sector,...,Easy Apply,Duplicate Count,Salary Type,min_salary,max_salary,State,job_description_cleaned,Job Title clean,Years Experience,role
0,!!!100% Remote!!! Sr. Data and Integration Eng...,Amerit Consulting,"Dallas, TX",74-134,3.5,Job Description\nOur client is an American for...,2002.0,State & Regional Agencies,$5 to $10 million (USD),Government,...,False,1.0,Annual (K),74.0,134.0,TX,job description our client is an american for ...,remote senior data and integration engineer,,Data Engineer
1,"""Business Analyst/ Data Analyst with experienc...",ESolutions Inc,"Tampa, FL",62-113,4.0,Job Title: Business Analyst/ Data Analyst with...,1999.0,IT Services,$50 to $100 million (USD),Information Technology,...,False,1.0,Annual (K),62.0,113.0,FL,job title business analyst data analyst with e...,business analyst data analyst experience on an...,,Data Analyst
2,#104252 Division Data and Financial Analyst,UC San Diego,"San Diego, CA",34-61,4.3,This position will remain open until filled.\n...,1960.0,Colleges & Universities,,Education,...,False,1.0,Annual (K),34.0,61.0,CA,this position will remain open until filled uc...,division data and financial analyst,,Data Analyst
3,#104293 Business Tech Support Analyst,UC San Diego,"San Diego, CA",31-61,4.3,UCSD Layoff from Career Appointment: Apply by ...,1960.0,Colleges & Universities,,Education,...,False,1.0,Annual (K),31.0,61.0,CA,ucsd layoff from career appointment apply by f...,business tech support analyst,,Business Analyst
4,#104733 HPC Systems and Data Engineer,UC San Diego,"San Diego, CA",46-92,4.3,The effects of the COVID-19 pandemic have impa...,1960.0,Colleges & Universities,,Education,...,False,2.0,Annual (K),46.0,92.0,CA,the effect of the covid pandemic have impacted...,hpc system and data engineer,,Data Engineer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13607,senior data systems analyst,Citi,Texas,-1,3.7,Position Summary: The Senior Data Systems anal...,1812.0,Investment Banking & Asset Management,$10+ billion (USD),Finance,...,False,1.0,Annual (K),,,TX,position summary the senior data system analys...,senior data system analyst,14,Data Analyst
13608,senior data systems analyst,Citibank,Texas,85-153,3.8,Position Summary: The Senior Data Systems anal...,,Lending,,Finance,...,False,1.0,Annual (K),85.0,153.0,TX,position summary the senior data system analys...,senior data system analyst,14,Data Analyst
13609,systems Analyst 1596 (d),The City of Los Angeles,"Los Angeles, CA",48-100,3.6,"DUTIESANNUAL SALARY$70,156 to $102,562The sala...",,Municipal Governments,,Government,...,False,1.0,Annual (K),48.0,100.0,CA,dutiesannual salary to salary in the departmen...,system analyst,,Business Analyst
13610,"{""title"":""GAO Analyst (Data Analysis)"",""extend...",Legislative Branch,"Washington, DC",61-108,4.3,DutiesHelpDutiesSummaryThis position is locate...,1789.0,State & Regional Agencies,,Government,...,False,3.0,Annual (K),61.0,108.0,,dutieshelpdutiessummarythis position is locate...,title gao analyst extended opening_type null,,Data Analyst


In [9]:

from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words

# Counting the number of stopwords in each set
nltk_stopwords = set(nltk_stopwords.words('english'))
sklearn_stopwords = set(sklearn_stop_words)

nltk_stopwords_count = len(nltk_stopwords)
sklearn_stopwords_count = len(sklearn_stopwords)

stopword_sets = dict()
stopword_sets['no'] = None
stopword_sets['nltk'] = list(nltk_stopwords)
stopword_sets['sklearn'] = list(sklearn_stopwords)
stopword_sets['nltk sklearn union'] = list(nltk_stopwords.union(sklearn_stopwords))
stopword_sets['nltk sklearn intersect'] = list(nltk_stopwords.intersection(sklearn_stopwords))



In [10]:
def plot_zipf(df, column, vectorizer, stopwords_name, ax=None, rank_lim=1e7, count_lim=1e6):
    # Fit and transform the data
    if vectorizer.stop_words is not None and isinstance(vectorizer.stop_words, list):
        print(f'removing {len(vectorizer.stop_words)} stopwords')
    count_matrix = vectorizer.fit_transform(df[column].fillna(''))
    ng_ = vectorizer.ngram_range
    print(ng_, '-', count_matrix.shape[1], 'tokens detected')

    # Sum the counts for each term in the corpus
    count_sum = np.array(count_matrix.sum(axis=0)).flatten()

    # Sort the counts in descending order
    sorted_count_sum = np.sort(count_sum)[::-1]

    # Check if axes are provided, if not, create new ones
    if ax is None:
        fig, ax = plt.subplots(figsize=(7 * 1.4, 6 * 1.4))

    # Plotting on the provided or new axes
    ax.loglog(sorted_count_sum, label=f'{ng_[0]}-{ng_[1]} grams stopwords {stopwords_name}', lw=2)

    # Setting labels, limits, and title only if new axes were created
    if ax is None:
        ax.set_xlabel('Rank')
        ax.set_ylabel('Counts')
        ax.set_xlim([0, rank_lim])
        ax.set_ylim([0, count_lim])
        ax.set_title(f'{column}: Word Freq, {count_matrix.shape[1]} tokens, {stopwords_name} stopwords removed')
        ax.legend()
        ax.grid(True)
        plt.show()
# Job Title,Company Name,Location,Salary Estimate,Rating,Job Description,Founded,Industry,Revenue,Sector,Size,Type,Easy Apply,Duplicate Count,
# Job Title clean,Salary Type,min_salary,max_salary,job_description_cleaned,Years Experience


In [11]:
count_data = {}
for col in ['job_description_cleaned']:
    for sw in stopword_sets:
        for j in range(1, 4):
            vectorizer = CountVectorizer(ngram_range=(1, j), stop_words=stopword_sets[sw])
            count_matrix = vectorizer.fit_transform(combined_data[col].fillna(''))
            count_sum = np.array(count_matrix.sum(axis=0)).flatten()
            sorted_count_sum = np.sort(count_sum)[::-1]
            count_data[(col, sw, j)] = sorted_count_sum

In [None]:
for col in ['job_description_cleaned']:
    for sw in stopword_sets:
        fig, ax = plt.subplots(figsize=(6*1.4, 6*1.4))
        for j in range(1, 4):
            sorted_count_sum = count_data[(col, sw, j)]
            ax.loglog(sorted_count_sum, label=f'N-gram range (1, {j})')

        ax.set_xlabel('Rank')
        ax.set_ylabel('Counts')
        ax.set_xlim([0, 1e7])
        ax.set_ylim([0, 1e6])
        ax.set_title(f'{col}: Word Freq, stopwords {sw}')
        ax.legend()
        ax.grid(True)
        ax.set_aspect('equal', adjustable='box')
        plt.tight_layout()
        plt.savefig(f'zipf plots combined/zipf {col} stopwords {sw}.png', dpi=300)
        plt.close()


In [None]:
for col in ['job_description_cleaned']:
    for j in range(1, 4):
        fig, ax = plt.subplots(figsize=(6*1.4, 6*1.4))
        for sw in ['no','nltk sklearn union']:
            sorted_count_sum = count_data[(col, sw, j)]
            ax.loglog(sorted_count_sum, label=f'Stopwords {sw}')

        ax.set_xlabel('Rank')
        ax.set_ylabel('Counts')
        ax.set_xlim([0, 1e7])
        ax.set_ylim([0, 1e6])
        ax.set_title(f'{col}: Word Freq, ngram range (1, {j})')
        ax.legend()
        ax.grid(True)
        ax.set_aspect('equal', adjustable='box')
        plt.tight_layout()
        plt.savefig(f'zipf plots combined/zipf {col} ngrams 1-{j}.png', dpi=300)
        plt.close()


In [None]:
for col in ['job_description_cleaned']:
    fig, ax = plt.subplots(figsize=(11*1.4, 6*1.4))
    for sw in ['no','nltk sklearn union']:
        for j in range(1, 4):
            sorted_count_sum = count_data[(col, sw, j)]
            ax.loglog(sorted_count_sum, label=f'Stopwords {sw}, N-gram (1, {j})')

    ax.set_xlabel('Rank')
    ax.set_ylabel('Counts')
    ax.set_xlim([0, 1e7])
    ax.set_ylim([0, 1e6])
    ax.set_title(f'{col}: Combined Word Freq Analysis')
    ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    ax.grid(True)
    ax.set_aspect('equal', adjustable='box')
    plt.tight_layout()
    plt.savefig(f'zipf plots combined/zipf {col} combined.png', dpi=300)
    plt.close()


In [None]:
asdf

In [None]:
for col in ['Job Description','job_description_cleaned','Job Title','Job Title clean']:
    for i in range(1,4):
        for j in range(i,4):
            for sw in stopword_sets:
                print(col, (i,j), '- stopwords:', sw)
                plot_zipf(combined_data, col, CountVectorizer(ngram_range=(i,j), stop_words=stopword_sets[sw]), stopwords_name=sw)
                plt.savefig(f'zipf plots/zipf {col} ngrams {i}-{j} stopwords {sw}', dpi=300)
                plt.close()

In [None]:
for col in ['Job Title','Job Title clean']:
    for i in range(1,4):
        for j in range(i,4):
            for sw in stopword_sets:
                print(col, (i,j), '- stopwords:', sw)
                plot_zipf(combined_data, col, CountVectorizer(ngram_range=(i,j), stop_words=stopword_sets[sw]), stopwords_name=sw)
                plt.savefig(f'zipf plots/zipf {col} ngrams {i}-{j} stopwords {sw}', dpi=300)
                plt.close()

In [19]:
dfs = []

for col in ['Job Description','job_description_cleaned','Job Title','Job Title clean']:
    with open(f'{col}.txt', 'r') as file:
        file_content = file.read()

    df = pd.DataFrame()
    lines = file_content.split('\n')
    i = 0
    while i < len(lines):
        line = lines[i]

        if col in line:
            # Extracting the N-gram and stopwords information
            n_gram_info, stopwords_info = line.split(' - ')
            n_gram = n_gram_info.split('(')[-1].split(')')[0]  # Extracting the N-gram range
            n0,n1 = n_gram.split(', ')
            stopwords = stopwords_info.split(': ')[-1]

            if stopwords == 'no':
                # If stopwords are 'no', the token count is in the next line
                token_count = int(lines[i + 1].split(' - ')[-1].split()[0])
                i += 2  # Incrementing by 2 to skip the next line
            else:
                # If stopwords are specified, the token count is in the line after the next
                token_count = int(lines[i + 2].split(' - ')[-1].split()[0])
                i += 3  # Incrementing by 3 to skip the next two lines

            temp_df = pd.DataFrame({'ngram range': [n_gram], 'ngram min': [int(n0)], 'ngram max': [int(n1)], 'stopwords': [stopwords], 'num tokens': [token_count]})
            df = pd.concat([df, temp_df], ignore_index=True)
        else:
            i += 1  # Incrementing to check the next line

    # print(df)
    df['column'] = col
    df = df[['column','ngram range','ngram min', 'ngram max', 'stopwords', 'num tokens']]
    dfs.append(df)



In [22]:
dfs[0]

Unnamed: 0,column,ngram range,ngram min,ngram max,stopwords,num tokens
0,Job Description,"1, 1",1,1,no,67695
1,Job Description,"1, 1",1,1,nltk,67557
2,Job Description,"1, 1",1,1,sklearn,67400
3,Job Description,"1, 1",1,1,nltk sklearn union,67378
4,Job Description,"1, 1",1,1,nltk sklearn intersect,67579
5,Job Description,"1, 2",1,2,no,1586888
6,Job Description,"1, 2",1,2,nltk,1924770
7,Job Description,"1, 2",1,2,sklearn,1885504
8,Job Description,"1, 2",1,2,nltk sklearn union,1881728
9,Job Description,"1, 2",1,2,nltk sklearn intersect,1928322


In [25]:
ret = pd.concat(dfs)
ret.rename(columns={'num tokens':'num tokens detected by vectorizer'},inplace=True)
ret.to_csv('stopword token analysis.csv',index=False)
ret

Unnamed: 0,column,ngram range,ngram min,ngram max,stopwords,num tokens detected by vectorizer
0,Job Description,"1, 1",1,1,no,67695
1,Job Description,"1, 1",1,1,nltk,67557
2,Job Description,"1, 1",1,1,sklearn,67400
3,Job Description,"1, 1",1,1,nltk sklearn union,67378
4,Job Description,"1, 1",1,1,nltk sklearn intersect,67579
...,...,...,...,...,...,...
25,Job Title clean,"3, 3",3,3,no,23952
26,Job Title clean,"3, 3",3,3,nltk,22413
27,Job Title clean,"3, 3",3,3,sklearn,21753
28,Job Title clean,"3, 3",3,3,nltk sklearn union,21742
