## Combining data gathered using the Apify API

In [None]:
import pandas as pd
import re

# Folder holding every CSV this notebook reads and writes. It is relative to the
# notebook's working directory, matching the 'datasets/...' paths used by the
# later cells, so the notebook is not tied to one user's absolute Windows path.
DATASETS_DIR = 'datasets'

# One listings file per role.
df_analyst = pd.read_csv(f'{DATASETS_DIR}/job_listings_data_analyst.csv')
df_engineer = pd.read_csv(f'{DATASETS_DIR}/job_listings_data_engineer.csv')
df_scientist = pd.read_csv(f'{DATASETS_DIR}/job_listings_data_scientist.csv')

# Record which file each row came from before the frames are stacked.
df_analyst['job_category'] = 'Data Analyst'
df_engineer['job_category'] = 'Data Engineer'
df_scientist['job_category'] = 'Data Scientist'

# ignore_index=True renumbers the rows 0..n-1 instead of repeating each
# source frame's own index.
combined_df = pd.concat([df_analyst, df_engineer, df_scientist], ignore_index=True)

combined_csv_path = f'{DATASETS_DIR}/combined_job_listings.csv'
combined_df.to_csv(combined_csv_path, index=False)
print(f"Combined data saved to '{combined_csv_path}'")


Combined data saved to 'C:\Users\shiva\OneDrive\Documents\Github\Job_Trend_Analysis\datasets\combined_job_listings.csv'


## Data Cleaning

In [40]:
# Load the combined job listings CSV for cleaning.
combined_listings_path = 'datasets/combined_job_listings.csv'
df = pd.read_csv(combined_listings_path)

def categorize_job_title(title):
    """Map a raw job title to one of the three tracked job categories.

    The title is lower-cased and searched for the whole-word phrases
    "data scientist", "data engineer" and "data analyst", in that order. The
    first phrase found wins, so "Data Scientist / Data Analyst" is labelled
    'Data Scientist'. The word-boundary anchors mean "metadata engineer" does
    not count as a data engineer.

    Args:
        title: Job title text. Anything that is not a string (for example the
            float NaN pandas uses for a missing value) is treated as
            uncategorised.

    Returns:
        'Data Scientist', 'Data Engineer', 'Data Analyst', or 'Other' when no
        phrase matches.
    """
    # A missing title has no .lower(); label it 'Other' so the caller's filter
    # drops the row instead of the whole .apply() raising AttributeError.
    if not isinstance(title, str):
        return 'Other'

    title = title.lower()
    if re.search(r'\bdata\s+scientist\b', title):
        return 'Data Scientist'
    elif re.search(r'\bdata\s+engineer\b', title):
        return 'Data Engineer'
    elif re.search(r'\bdata\s+analyst\b', title):
        return 'Data Analyst'
    else:
        return 'Other'

# Derive the category from each listing's own title; this overwrites the
# job_category value that was stored in the CSV.
df['job_category'] = df['title'].apply(categorize_job_title)

# Keep only the three tracked roles. .copy() makes the filtered frame
# independent of the original, so adding columns to it in later cells does not
# trigger pandas' SettingWithCopyWarning.
df = df[df['job_category'] != 'Other'].copy()



In [41]:
 # List the distinct category labels present in the frame.
 df.loc[:, 'job_category'].unique()

array(['Data Analyst', 'Data Scientist', 'Data Engineer'], dtype=object)

In [42]:
# Row-wise mean of the two salary bounds. DataFrame.mean skips NaN by default,
# so a row with only one bound gets that bound and a row with neither stays NaN.
# NOTE(review): the `interval` column is not consulted here, so this assumes
# every min_amount/max_amount is quoted for the same pay period — confirm.
salary_bounds = df[['min_amount', 'max_amount']]
df['avg_salary'] = salary_bounds.mean(axis=1)

In [43]:
# Count the rows that have no avg_salary value.
df['avg_salary'].isnull().sum()

308

In [44]:
# Preview the first rows of the frame (head() shows five by default).
df.head()

Unnamed: 0,company,currency,date_posted,description,emails,interval,is_remote,job_function,job_level,job_type,job_url,job_url_direct,listing_type,location,max_amount,min_amount,title,job_category,avg_salary
0,CVS Health,USD,1738368000000.0,Bring your heart to CVS Health. Every one of u...,,yearly,False,,,fulltime,https://jobs.cvshealth.com/us/en/job/R0486462/...,,,"Hartford, CT",158620.0,72100.0,Senior Data Analyst,Data Analyst,115360.0
2,Mass General Brigham,,1738282000000.0,Site: The General Hospital Corporation\r\n\r\n...,,,True,,,fulltime,https://jobs.mehi.masstech.org/companies/mass-...,,,"Boston, MA",,,Data Analyst I,Data Analyst,
4,Pitchbook,USD,1738195000000.0,"At PitchBook, we are always looking forward. W...",,yearly,True,,,fulltime,https://careers.pitchbook.com/global/en/job/44...,,,"New York, NY",86400.0,72650.0,"Associate Data Analyst, Leveraged Loans",Data Analyst,79525.0
5,Amazon.com Services LLC,,1738195000000.0,Basic qualifications for an Supply Chain Data ...,,,False,,,fulltime,https://www.indeed.com/viewjob?jk=c3b830ca1a25...,,,"North Reading, MA",,,"Supply Chain Data Analyst, AR NPI - STL",Data Analyst,
7,SnapX,,1738109000000.0,**Required:**\r\n• *10+ yrs exp Data Analyst/E...,,,False,,,fulltime,https://snapx.ai/view-job/data-analyst-enginee...,,,"New York, NY",,,Data Analyst/Engineer,Data Analyst,


In [45]:
# Number of rows currently in the frame.
len(df.index)

367

In [46]:
# Columns to discard from the listings frame.
columns_to_remove = [
    'currency',
    'date_posted',
    'description',
    'emails',
    'interval',
    'is_remote',
    'job_function',
    'job_level',
    'job_type',
    'job_url',
    'job_url_direct',
    'listing_type',
    'max_amount',
    'min_amount',
]

# drop() returns a new frame; rebinding `df` leaves the remaining columns in
# their original order.
df = df.drop(columns_to_remove, axis=1)

In [47]:
# Write the reduced listings to disk; index=False keeps the row index out of
# the file.
cleaned_listings_path = 'datasets/combined_job_listings_cleaned.csv'
df.to_csv(cleaned_listings_path, index=False)
print("Cleaned data saved to 'datasets/combined_job_listings_cleaned.csv'")

Cleaned data saved to 'datasets/combined_job_listings_cleaned.csv'


## Cleaning Kaggle API Dataset

In [52]:

# Load the Glassdoor salary dataset for cleaning.
glassdoor_path = 'datasets/Glassdoor_Salary_Cleaned_Version.csv'
df_new = pd.read_csv(glassdoor_path)

def categorize_job_title(title):
    """Map a raw job title to one of the three tracked job categories.

    This redefines the `categorize_job_title` declared earlier in the notebook
    and replaces it for any cell run afterwards. Unlike that earlier version,
    the patterns here carry no word-boundary anchors, so they also match inside
    longer words: "Data Engineering Manager" counts as 'Data Engineer' and
    "metadata scientist" counts as 'Data Scientist'.

    The title is lower-cased and searched for "data scientist",
    "data engineer" and "data analyst", in that order; the first hit wins.

    Args:
        title: Job title text. Anything that is not a string (for example the
            float NaN pandas uses for a missing value) is treated as
            uncategorised.

    Returns:
        'Data Scientist', 'Data Engineer', 'Data Analyst', or 'Other' when no
        phrase matches.
    """
    # A missing title has no .lower(); label it 'Other' so the caller's filter
    # drops the row instead of the whole .apply() raising AttributeError.
    if not isinstance(title, str):
        return 'Other'

    title = title.lower()
    if re.search(r'data\s+scientist', title):
        return 'Data Scientist'
    elif re.search(r'data\s+engineer', title):
        return 'Data Engineer'
    elif re.search(r'data\s+analyst', title):
        return 'Data Analyst'
    else:
        return 'Other'

# Label every Glassdoor row from its job title, then keep only the rows that
# fall into one of the three tracked roles.
df_new = df_new.assign(job_category=df_new['Job Title'].apply(categorize_job_title))
df_new = df_new.loc[df_new['job_category'] != 'Other']

# Columns to discard from the Glassdoor frame.
columns_to_remove = [
    'Salary Estimate', 'Job Description', 'Rating', 'Headquarters',
    'Size', 'Founded', 'Type of ownership', 'Industry',
    'Sector', 'Revenue', 'Competitors', 'hourly',
    'employer_provided', 'min_salary', 'max_salary', 'company_txt',
    'job_state', 'same_state', 'age',
]
df_new = df_new.drop(columns_to_remove, axis=1)

# Write the reduced dataset to disk; index=False keeps the row index out of
# the file.
reduced_glassdoor_path = 'datasets/Glassdoor_Salary_Cleaned_Version_Reduced.csv'
df_new.to_csv(reduced_glassdoor_path, index=False)
print("Updated data saved to 'datasets/Glassdoor_Salary_Cleaned_Version_Reduced.csv'")



Updated data saved to 'datasets/Glassdoor_Salary_Cleaned_Version_Reduced.csv'


## Combining Data from the Kaggle API and the Apify API

In [55]:
# Load both reduced datasets. The listings frame takes on the Glassdoor column
# names so the two sides share the same join keys.
df_job_listings = pd.read_csv('datasets/combined_job_listings_cleaned.csv').rename(
    columns={'company': 'Company Name', 'location': 'Location', 'title': 'Job Title'}
)
df_glassdoor_salary = pd.read_csv('datasets/Glassdoor_Salary_Cleaned_Version_Reduced.csv')

# Trim surrounding whitespace and lower-case the text keys on both sides so
# differences in case or padding do not stop rows from matching.
for frame in (df_job_listings, df_glassdoor_salary):
    for text_key in ('Company Name', 'Location', 'Job Title'):
        frame[text_key] = frame[text_key].str.strip().str.lower()

# The outer join keeps every row from both sources. `avg_salary` exists on both
# sides but is not a join key, so it comes out as avg_salary_x (from the
# listings) and avg_salary_y (from Glassdoor).
df_final = pd.merge(
    df_job_listings,
    df_glassdoor_salary,
    on=['Job Title', 'Company Name', 'Location', 'job_category'],
    how='outer',
)

df_final.to_csv('datasets/combined_final_data.csv', index=False)
print("Complete merged data saved to 'datasets/combined_final_data.csv'")


Complete merged data saved to 'datasets/combined_final_data.csv'


## Cleaning the final Combined Data

In [69]:
import pandas as pd

df = pd.read_csv('datasets/combined_final_data.csv')

# NOTE(review): 2080 equals 40 h/week * 52 weeks, so this appears to treat the
# Glassdoor avg_salary as an hourly rate. Confirm against the dataset's actual
# units — the Glassdoor 'hourly' flag column was dropped before the merge and
# is not consulted here.
glassdoor_salary_scale = 2080

# NaN * scale stays NaN, so rows without a Glassdoor salary are left untouched.
df['avg_salary_y'] = df['avg_salary_y'] * glassdoor_salary_scale

# Prefer the salary from the job listings (_x); fall back to the scaled
# Glassdoor value (_y) where the listing has none.
df['avg_salary'] = df['avg_salary_x'].fillna(df['avg_salary_y'])

# The two source columns are no longer needed once they are unified.
df = df.drop(columns=['avg_salary_x', 'avg_salary_y'])

df.to_csv('datasets/combined_final_data_cleaned.csv', index=False)
print("Updated and unified salary data saved to 'datasets/combined_final_data_cleaned.csv'")


Updated and unified salary data saved to 'datasets/combined_final_data_cleaned.csv'


In [60]:
# Preview the first rows of the frame (head() shows five by default).
df.head()

Unnamed: 0,Company Name,Location,Job Title,job_category,python_yn,R_yn,spark,aws,excel,avg_salary
0,bits,london,"(senior) data engineer (ml, big data)",Data Engineer,,,,,,
1,walmart,"germantown, md","(usa) principal, data scientist",Data Scientist,,,,,,198000.0
2,walmart,"reston, va","(usa) principal, data scientist",Data Scientist,,,,,,198000.0
3,walmart,"germantown, md","(usa) staff, data scientist",Data Scientist,,,,,,198000.0
4,walmart,"reston, va","(usa) staff, data scientist",Data Scientist,,,,,,198000.0


In [65]:
# Count the rows that have no avg_salary value.
df['avg_salary'].isnull().sum()

308

In [67]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,python_yn,R_yn,spark,aws,excel,avg_salary
count,497.0,497.0,497.0,497.0,497.0,556.0
mean,0.649899,0.004024,0.307847,0.291751,0.539235,207741.151079
std,0.477482,0.063372,0.462068,0.455026,0.49896,76330.908422
min,0.0,0.0,0.0,0.0,0.0,28080.0
25%,0.0,0.0,0.0,0.0,0.0,151840.0
50%,1.0,0.0,0.0,0.0,1.0,203840.0
75%,1.0,0.0,1.0,1.0,1.0,251940.0
max,1.0,1.0,1.0,1.0,1.0,494000.0


In [68]:
# Column dtypes and non-null counts, to see how much of each column is populated.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 864 entries, 0 to 863
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Company Name  864 non-null    object 
 1   Location      858 non-null    object 
 2   Job Title     864 non-null    object 
 3   job_category  864 non-null    object 
 4   python_yn     497 non-null    float64
 5   R_yn          497 non-null    float64
 6   spark         497 non-null    float64
 7   aws           497 non-null    float64
 8   excel         497 non-null    float64
 9   avg_salary    556 non-null    float64
dtypes: float64(6), object(4)
memory usage: 67.6+ KB
