In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

import os, re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from sklearn.metrics import silhouette_score, davies_bouldin_score
import numpy as np

from tqdm import tqdm



In [2]:
import warnings
warnings.filterwarnings("ignore")

In [9]:

# Load the cleaned scraper export for each searched role (one CSV per role).
df_business_analyst = pd.read_csv('combined scraper results/business analyst clean.csv')
df_data_analyst = pd.read_csv('combined scraper results/data analyst clean.csv')
df_data_scientist = pd.read_csv('combined scraper results/data scientist clean.csv')
df_data_engineer = pd.read_csv('combined scraper results/data engineer clean.csv')

# Stack the four frames into one table for TF-IDF analysis, tagging every row
# with the role whose file it came from. The tuple order fixes the row order
# of the concatenation.
combined_data = pd.concat([
    frame.assign(role=label)
    for label, frame in (
        ("Business Analyst", df_business_analyst),
        ("Data Analyst", df_data_analyst),
        ("Data Scientist", df_data_scientist),
        ("Data Engineer", df_data_engineer),
    )
])

# Location and Salary Estimate are passed as `exclude` because salary can vary
# by location; only the job descriptions are analyzed downstream, so that is
# acceptable here.
# NOTE(review): the return value is discarded, so this relies on
# remove_duplicates (defined elsewhere in the notebook) modifying
# combined_data in place -- confirm against its definition.
remove_duplicates(combined_data, exclude=['Location','Salary Estimate'])
print(combined_data["role"].value_counts())

# Column reference for the cleaned CSVs:
#   Job Title, Company Name, Location, Salary Estimate, Rating, Job Description,
#   Founded, Industry, Revenue, Sector, Size, Type, Easy Apply, Duplicate Count,
#   Job Title clean, Salary Type, min_salary, max_salary,
#   job_description_cleaned, Years Experience

combined_data

Duplicate Count
1.0     15855
2.0      7490
3.0       513
4.0       288
6.0       168
5.0       145
10.0      120
50.0      100
7.0        98
97.0       97
31.0       93
42.0       84
8.0        80
26.0       78
37.0       74
24.0       72
71.0       71
33.0       66
16.0       64
9.0        63
60.0       60
15.0       60
12.0       48
47.0       47
40.0       40
39.0       39
17.0       34
11.0       33
30.0       30
29.0       29
28.0       28
27.0       27
13.0       26
22.0       22
20.0       20
19.0       19
14.0       14
Name: count, dtype: int64
role
Data Analyst        8702
Business Analyst    6511
Data Scientist      4626
Data Engineer       2217
Name: count, dtype: int64


Unnamed: 0,Job Title,Company Name,Location,Salary Estimate,Rating,Job Description,Founded,Industry,Revenue,Sector,...,Type,Easy Apply,Duplicate Count,Salary Type,min_salary,max_salary,job_description_cleaned,Job Title clean,Years Experience,role
0,#11598 - Data Collection Moderator,Qualitest,"Mountain View, CA",50-70,3.5,Q Analysts - A Qualitest Company is looking fo...,1997.0,Information Technology Support Services,,Information Technology,...,Company - Private,True,1.0,Annual (K),50.0,70.0,q analyst a qualitest company is looking for a...,data collection moderator,2+,Data Scientist
1,#11885 - Data Collection Technician,Qualitest,"Burlingame, CA",20.00-22.00,3.5,"Q Analysts, a Qualitest Company, is looking fo...",1997.0,Information Technology Support Services,,Information Technology,...,Company - Private,True,1.0,Per Hour,20.0,22.0,q analyst a qualitest company is looking for a...,data collection technician,2+,Data Scientist
2,(2) Sr Business Analyst/s,RiseIT Solutions,"Des Moines, IA",63.00,3.7,Title: (2) Sr Business Analyst/s\nLocation: De...,,Enterprise Software & Network Solutions,,Information Technology,...,Company - Private,False,2.0,Per Hour,63.0,63.0,title senior business analyst s location de mo...,senior business analyst s,,Data Analyst
3,"(Associate) Director, Manufacturing Operations","Novavax, Inc.","Gaithersburg, MD",109-159,3.3,(Nasdaq:NVAX) is a late-stage biotechnology co...,1987.0,Biotech & Pharmaceuticals,$100 to $500 million (USD),Pharmaceutical & Biotechnology,...,Company - Public,False,1.0,Annual (K),109.0,159.0,nasdaq nvax is a late stage biotechnology comp...,director manufacturing operation,7-10,Business Analyst
4,(Bid) Pricing Analyst,Daikin Comfort Technologies,"Denver, CO",67-93,3.3,Overview:\n\n(Bid) Pricing Analyst -Remote\n\n...,1924.0,Machinery Manufacturing,,Manufacturing,...,Subsidiary or Business Segment,False,1.0,Annual (K),67.0,93.0,overview bid pricing analyst remote about moti...,pricing analyst,,Data Analyst
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22051,senior data engineer,Capgemini,"Saint Louis, MO",100-134,3.7,JOB DESCRIPTION\n\nHaving experience with AWS ...,1967.0,Enterprise Software & Network Solutions,$10+ billion (USD),Information Technology,...,Company - Public,False,1.0,Annual (K),100.0,134.0,job description having experience with aws ser...,senior data engineer,7-10,Data Engineer
22052,"senior performance measures analyst , HR Share...",Starbucks,"Seattle, WA",78-133,3.7,Final compensation range is determined by cand...,1971.0,Restaurants & Cafes,$10+ billion (USD),Restaurants & Food Service,...,Company - Public,False,1.0,Annual (K),78.0,133.0,final compensation range is determined by cand...,senior performance measure analyst hr shared s...,5+,Data Analyst
22053,systems Analyst with to fraud mitigation on w2,Formac Inc,"Houston, TX",34.00-48.00,4.2,Systems analyst on w2\n\nHybrid Houston TX\n\n...,2013.0,Information Technology Support Services,,Information Technology,...,Company - Private,True,1.0,Per Hour,34.0,48.0,system analyst on hybrid houston tx hr on work...,system analyst to fraud mitigation on,5,Business Analyst
22054,vCIO,"CITOC, Inc.","Houston, TX",80-100,3.2,Primary Role and Responsibilities\n\nBusiness ...,,-1,$1 to $5 million (USD),,...,Company - Private,True,1.0,Annual (K),80.0,100.0,primary role and responsibility business strat...,vcio,3,Business Analyst
