In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from sklearn.metrics import silhouette_score, davies_bouldin_score
import numpy as np

from tqdm import tqdm



In [2]:
import warnings
warnings.filterwarnings("ignore")

cosine sim for all datapoints

In [3]:
def remove_duplicates(df:pd.DataFrame, exclude=None):
    subset_columns = ['Job Title', 'Company Name', 'Location', 'Salary Estimate', 'Rating', 'Job Description']
    if exclude:
        if isinstance(exclude, list):
            subset_columns = [a for a in subset_columns if a not in exclude]
        else:
            subset_columns.remove(exclude)
    
    df.sort_values(subset_columns, inplace=True)

    df['Duplicate Count'] = df.groupby(subset_columns)['Job Title'].transform('count')
    print(df['Duplicate Count'].value_counts())
    df.drop_duplicates(subset=subset_columns, inplace=True, keep='last')
    df.reset_index(drop=True,inplace=True)
    

In [4]:

df_business_analyst = pd.read_csv('../combined scraper results/business analyst clean.csv')
df_data_analyst = pd.read_csv('../combined scraper results/data analyst clean.csv')
df_data_engineer = pd.read_csv('../combined scraper results/data engineer clean.csv')
df_data_scientist = pd.read_csv('../combined scraper results/data scientist clean.csv')

# Combine data for TF-IDF analysis
combined_data = pd.concat([
    df_business_analyst.assign(role="Business Analyst"),
    df_data_analyst.assign(role="Data Analyst"),
    df_data_scientist.assign(role="Data Scientist"),
    df_data_engineer.assign(role="Data Engineer"),
])
# print(combined_data.role.value_counts())
# combined_data
remove_duplicates(combined_data, exclude=['Location','Salary Estimate']) # sometimes salary is location-dependent. we are only analyzing job descriptions so this is fine.
print(combined_data.role.value_counts())

# Job Title,Company Name,Location,Salary Estimate,Rating,Job Description,Founded,Industry,Revenue,Sector,Size,Type,Easy Apply,Duplicate Count,
# Job Title clean,Salary Type,min_salary,max_salary,job_description_cleaned,Years Experience
combined_data['Job Title clean underscored'] = combined_data['Job Title clean'].apply(lambda x: str(x).replace(' ','_'))

combined_data

Duplicate Count
1.0     15855
2.0      7490
3.0       513
4.0       288
6.0       168
5.0       145
10.0      120
50.0      100
7.0        98
97.0       97
31.0       93
42.0       84
8.0        80
26.0       78
37.0       74
24.0       72
71.0       71
33.0       66
16.0       64
9.0        63
60.0       60
15.0       60
12.0       48
47.0       47
40.0       40
39.0       39
17.0       34
11.0       33
30.0       30
29.0       29
28.0       28
27.0       27
13.0       26
22.0       22
20.0       20
19.0       19
14.0       14
Name: count, dtype: int64
role
Data Analyst        8702
Business Analyst    6511
Data Scientist      4626
Data Engineer       2217
Name: count, dtype: int64


Unnamed: 0,Job Title,Company Name,Location,Salary Estimate,Rating,Job Description,Founded,Industry,Revenue,Sector,...,Duplicate Count,Salary Type,min_salary,max_salary,State,job_description_cleaned,Job Title clean,Years Experience,role,Job Title clean underscored
0,#11598 - Data Collection Moderator,Qualitest,"Mountain View, CA",50-70,3.5,Q Analysts - A Qualitest Company is looking fo...,1997.0,Information Technology Support Services,,Information Technology,...,1.0,Annual (K),50.0,70.0,CA,q analyst a qualitest company is looking for a...,data collection moderator,2+,Data Scientist,data_collection_moderator
1,#11885 - Data Collection Technician,Qualitest,"Burlingame, CA",20.00-22.00,3.5,"Q Analysts, a Qualitest Company, is looking fo...",1997.0,Information Technology Support Services,,Information Technology,...,1.0,Per Hour,20.0,22.0,CA,q analyst a qualitest company is looking for a...,data collection technician,2+,Data Scientist,data_collection_technician
2,(2) Sr Business Analyst/s,RiseIT Solutions,"Des Moines, IA",63.00,3.7,Title: (2) Sr Business Analyst/s\nLocation: De...,,Enterprise Software & Network Solutions,,Information Technology,...,2.0,Per Hour,63.0,63.0,IA,title senior business analyst s location de mo...,senior business analyst s,,Data Analyst,senior_business_analyst_s
3,"(Associate) Director, Manufacturing Operations","Novavax, Inc.","Gaithersburg, MD",109-159,3.3,(Nasdaq:NVAX) is a late-stage biotechnology co...,1987.0,Biotech & Pharmaceuticals,$100 to $500 million (USD),Pharmaceutical & Biotechnology,...,1.0,Annual (K),109.0,159.0,MD,nasdaq nvax is a late stage biotechnology comp...,director manufacturing operation,7-10,Business Analyst,director_manufacturing_operation
4,(Bid) Pricing Analyst,Daikin Comfort Technologies,"Denver, CO",67-93,3.3,Overview:\n\n(Bid) Pricing Analyst -Remote\n\n...,1924.0,Machinery Manufacturing,,Manufacturing,...,1.0,Annual (K),67.0,93.0,CO,overview bid pricing analyst remote about moti...,pricing analyst,,Data Analyst,pricing_analyst
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22051,senior data engineer,Capgemini,"Saint Louis, MO",100-134,3.7,JOB DESCRIPTION\n\nHaving experience with AWS ...,1967.0,Enterprise Software & Network Solutions,$10+ billion (USD),Information Technology,...,1.0,Annual (K),100.0,134.0,MO,job description having experience with aws ser...,senior data engineer,7-10,Data Engineer,senior_data_engineer
22052,"senior performance measures analyst , HR Share...",Starbucks,"Seattle, WA",78-133,3.7,Final compensation range is determined by cand...,1971.0,Restaurants & Cafes,$10+ billion (USD),Restaurants & Food Service,...,1.0,Annual (K),78.0,133.0,WA,final compensation range is determined by cand...,senior performance measure analyst hr shared s...,5+,Data Analyst,senior_performance_measure_analyst_hr_shared_s...
22053,systems Analyst with to fraud mitigation on w2,Formac Inc,"Houston, TX",34.00-48.00,4.2,Systems analyst on w2\n\nHybrid Houston TX\n\n...,2013.0,Information Technology Support Services,,Information Technology,...,1.0,Per Hour,34.0,48.0,TX,system analyst on hybrid houston tx hr on work...,system analyst to fraud mitigation on,5,Business Analyst,system_analyst_to_fraud_mitigation_on
22054,vCIO,"CITOC, Inc.","Houston, TX",80-100,3.2,Primary Role and Responsibilities\n\nBusiness ...,,-1,$1 to $5 million (USD),,...,1.0,Annual (K),80.0,100.0,TX,primary role and responsibility business strat...,vcio,3,Business Analyst,vcio


In [11]:
def plot_zipf(df, column, vectorizer, stopwords_name, ax=None, rank_lim=1e7, count_lim=1e6):
    # Fit and transform the data
    if vectorizer.stop_words is not None and isinstance(vectorizer.stop_words, list):
        print(f'removing {len(vectorizer.stop_words)} stopwords')
    count_matrix = vectorizer.fit_transform(df[column].fillna(''))
    ng_ = vectorizer.ngram_range
    print(ng_, '-', count_matrix.shape[1], 'tokens detected')

    # Sum the counts for each term in the corpus
    count_sum = np.array(count_matrix.sum(axis=0)).flatten()

    # Sort the counts in descending order
    sorted_count_sum = np.sort(count_sum)[::-1]

    # Check if axes are provided, if not, create new ones
    if ax is None:
        fig, ax = plt.subplots(figsize=(7 * 1.4, 6 * 1.4))

    # Plotting on the provided or new axes
    ax.loglog(sorted_count_sum, label=f'{ng_[0]}-{ng_[1]} grams stopwords {stopwords_name}', lw=2)

    # Setting labels, limits, and title only if new axes were created
    if ax is None:
        ax.set_xlabel('Rank')
        ax.set_ylabel('Counts')
        ax.set_xlim([0, rank_lim])
        ax.set_ylim([0, count_lim])
        ax.set_title(f'{column}: Word Freq, {count_matrix.shape[1]} tokens, {stopwords_name} stopwords removed')
        ax.legend()
        ax.grid(True)
        plt.show()
# Job Title,Company Name,Location,Salary Estimate,Rating,Job Description,Founded,Industry,Revenue,Sector,Size,Type,Easy Apply,Duplicate Count,
# Job Title clean,Salary Type,min_salary,max_salary,job_description_cleaned,Years Experience


In [67]:

from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words

# Counting the number of stopwords in each set
nltk_stopwords = set(nltk_stopwords.words('english'))
sklearn_stopwords = set(sklearn_stop_words)

nltk_stopwords_count = len(nltk_stopwords)
sklearn_stopwords_count = len(sklearn_stopwords)

stopword_sets = dict()
stopword_sets['no'] = None
stopword_sets['nltk sklearn intersect'] = list(nltk_stopwords.intersection(sklearn_stopwords))
stopword_sets['nltk'] = list(nltk_stopwords)
stopword_sets['sklearn'] = list(sklearn_stopwords)
stopword_sets['nltk sklearn union'] = list(nltk_stopwords.union(sklearn_stopwords))




In [18]:
# count_data = {}
# for col in ['job_description_cleaned']:
#     for sw in stopword_sets:
#         for j in range(1, 4):
#             vectorizer = CountVectorizer(ngram_range=(1, j), stop_words=stopword_sets[sw])
#             count_matrix = vectorizer.fit_transform(combined_data[col].fillna(''))
#             count_sum = np.array(count_matrix.sum(axis=0)).flatten()
#             sorted_count_sum = np.sort(count_sum)[::-1]
#             count_data[(col, sw, j)] = sorted_count_sum

In [69]:
for col in ['job_description_cleaned']:
    for sw in stopword_sets:
        fig, ax = plt.subplots(figsize=(6*1.4, 6*1.4))
        for j in range(1, 4):
            sorted_count_sum = count_data[(col, sw, j)]
            # print()
            ax.loglog(sorted_count_sum, label=f'N-gram (1, {j})\n{len(sorted_count_sum)} tokens')

        ax.set_xlabel('Rank')
        ax.set_ylabel('Counts')
        ax.set_xlim([0, 1e7])
        ax.set_ylim([0, 1e6])
        ax.set_title(f'{col}: Word Freq, {len(stopword_sets[sw]) if stopword_sets[sw] else 0} stopwords ({sw})')
        ax.legend()
        ax.grid(True)
        ax.set_aspect('equal', adjustable='box')
        plt.tight_layout()
        plt.savefig(f'zipf plots combined/zipf {col} stopwords {sw}.png', dpi=300)
        plt.close()


In [73]:
stopword_sets.keys()

dict_keys(['no', 'nltk sklearn intersect', 'nltk', 'sklearn', 'nltk sklearn union'])

In [77]:
[len(stopword_sets[a]) for a in stopword_sets if stopword_sets[a]]

[119, 179, 318, 378]

In [70]:
for col in ['job_description_cleaned']:
    for j in range(1, 4):
        fig, ax = plt.subplots(figsize=(6*1.4, 6*1.4))
        for sw in stopword_sets:
            sorted_count_sum = count_data[(col, sw, j)]
            ax.loglog(sorted_count_sum, label=f'{len(stopword_sets[sw]) if stopword_sets[sw] else 0} stopwords ({sw})\n{len(sorted_count_sum)} tokens')

        ax.set_xlabel('Rank')
        ax.set_ylabel('Counts')
        ax.set_xlim([0, 1e7])
        ax.set_ylim([0, 1e6])
        ax.set_title(f'{col}: Word Freq, ngram range (1, {j})')
        ax.legend()
        ax.grid(True)
        ax.set_aspect('equal', adjustable='box')
        plt.tight_layout()
        plt.savefig(f'zipf plots combined/zipf {col} ngrams 1-{j}.png', dpi=300)
        plt.close()


In [76]:
for col in ['job_description_cleaned']:
    fig, ax = plt.subplots(figsize=(8*1.4, 8*1.4))
    for j in range(1, 4):
        for sw in ['no','nltk sklearn intersect']:    
        # for sw in ['no', 'nltk sklearn intersect', 'nltk sklearn union']:
            sorted_count_sum = count_data[(col, sw, j)]
            ax.loglog(sorted_count_sum, label=f'{len(stopword_sets[sw]) if stopword_sets[sw] else 0} stopwords ({sw})\nN-gram (1, {j})\n{len(sorted_count_sum)} tokens')

    ax.set_xlabel('Rank')
    ax.set_ylabel('Counts')
    ax.set_xlim([0, 1e7])
    ax.set_ylim([0, 1e6])
    ax.set_title(f'{col}: Combined Word Freq Analysis')
    ax.legend()
    # ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    ax.grid(True)
    ax.set_aspect('equal', adjustable='box')
    plt.tight_layout()
    plt.savefig(f'zipf plots combined/zipf {col} combined.png', dpi=300)
    plt.close()


In [72]:
asdf

NameError: name 'asdf' is not defined

In [None]:
for col in ['Job Description','job_description_cleaned','Job Title','Job Title clean']:
    for i in range(1,4):
        for j in range(i,4):
            for sw in stopword_sets:
                print(col, (i,j), '- stopwords:', sw)
                plot_zipf(combined_data, col, CountVectorizer(ngram_range=(i,j), stop_words=stopword_sets[sw]), stopwords_name=sw)
                plt.savefig(f'zipf plots/zipf {col} ngrams {i}-{j} stopwords {sw}', dpi=300)
                plt.close()

In [None]:
for col in ['Job Title','Job Title clean']:
    for i in range(1,4):
        for j in range(i,4):
            for sw in stopword_sets:
                print(col, (i,j), '- stopwords:', sw)
                plot_zipf(combined_data, col, CountVectorizer(ngram_range=(i,j), stop_words=stopword_sets[sw]), stopwords_name=sw)
                plt.savefig(f'zipf plots/zipf {col} ngrams {i}-{j} stopwords {sw}', dpi=300)
                plt.close()

In [None]:
dfs = []

for col in ['Job Description','job_description_cleaned','Job Title','Job Title clean']:
    with open(f'{col}.txt', 'r') as file:
        file_content = file.read()

    df = pd.DataFrame()
    lines = file_content.split('\n')
    i = 0
    while i < len(lines):
        line = lines[i]

        if col in line:
            # Extracting the N-gram and stopwords information
            n_gram_info, stopwords_info = line.split(' - ')
            n_gram = n_gram_info.split('(')[-1].split(')')[0]  # Extracting the N-gram range
            n0,n1 = n_gram.split(', ')
            stopwords = stopwords_info.split(': ')[-1]

            if stopwords == 'no':
                # If stopwords are 'no', the token count is in the next line
                token_count = int(lines[i + 1].split(' - ')[-1].split()[0])
                i += 2  # Incrementing by 2 to skip the next line
            else:
                # If stopwords are specified, the token count is in the line after the next
                token_count = int(lines[i + 2].split(' - ')[-1].split()[0])
                i += 3  # Incrementing by 3 to skip the next two lines

            temp_df = pd.DataFrame({'ngram range': [n_gram], 'ngram min': [int(n0)], 'ngram max': [int(n1)], 'stopwords': [stopwords], 'num tokens': [token_count]})
            df = pd.concat([df, temp_df], ignore_index=True)
        else:
            i += 1  # Incrementing to check the next line

    # print(df)
    df['column'] = col
    df = df[['column','ngram range','ngram min', 'ngram max', 'stopwords', 'num tokens']]
    dfs.append(df)



In [None]:
dfs[0]

Unnamed: 0,column,ngram range,ngram min,ngram max,stopwords,num tokens
0,Job Description,"1, 1",1,1,no,67695
1,Job Description,"1, 1",1,1,nltk,67557
2,Job Description,"1, 1",1,1,sklearn,67400
3,Job Description,"1, 1",1,1,nltk sklearn union,67378
4,Job Description,"1, 1",1,1,nltk sklearn intersect,67579
5,Job Description,"1, 2",1,2,no,1586888
6,Job Description,"1, 2",1,2,nltk,1924770
7,Job Description,"1, 2",1,2,sklearn,1885504
8,Job Description,"1, 2",1,2,nltk sklearn union,1881728
9,Job Description,"1, 2",1,2,nltk sklearn intersect,1928322


In [None]:
ret = pd.concat(dfs)
ret.rename(columns={'num tokens':'num tokens detected by vectorizer'},inplace=True)
ret.to_csv('stopword token analysis.csv',index=False)
ret

Unnamed: 0,column,ngram range,ngram min,ngram max,stopwords,num tokens detected by vectorizer
0,Job Description,"1, 1",1,1,no,67695
1,Job Description,"1, 1",1,1,nltk,67557
2,Job Description,"1, 1",1,1,sklearn,67400
3,Job Description,"1, 1",1,1,nltk sklearn union,67378
4,Job Description,"1, 1",1,1,nltk sklearn intersect,67579
...,...,...,...,...,...,...
25,Job Title clean,"3, 3",3,3,no,23952
26,Job Title clean,"3, 3",3,3,nltk,22413
27,Job Title clean,"3, 3",3,3,sklearn,21753
28,Job Title clean,"3, 3",3,3,nltk sklearn union,21742
