In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import os

In [2]:
def read_html_from_file(path):
    """Return the entire text content of the file at ``path``.

    The file is opened in text mode and decoded as UTF-8. Bytes that are not
    valid UTF-8 are substituted with the Unicode replacement character rather
    than raising a decoding error.
    """
    with open(path, 'r', encoding='utf-8', errors='replace') as handle:
        return handle.read()

In [3]:
def parse_job_details(soup):
    """Extract one record per job card from a parsed search-results page.

    Every ``div`` with class ``job_seen_beacon`` is treated as one job card.
    For each card exactly one value is appended to each of the eight result
    lists, so the lists always have equal length and index ``i`` in every
    list refers to the same card. Fields that cannot be found are filled
    with ``'Not Provided'`` (or ``''`` for the description, matching the
    placeholder already used when a description has no ``<ul>``).

    Args:
        soup: A parsed document exposing the BeautifulSoup ``find_all`` /
            ``find`` search interface.

    Returns:
        A tuple of eight lists, in this order:
        ``(job_titles, companies, locations, salaries, job_types,
        descriptions, dates, links)``.
    """
    missing = 'Not Provided'
    title_classes = [
        'jobTitle css-198pbd eu4oa1w0',
        'jobTitle jobTitle-newJob css-198pbd eu4oa1w0',
    ]
    job_titles, companies, locations, salaries = [], [], [], []
    job_types, descriptions, dates, links = [], [], [], []

    for job in soup.find_all('div', class_='job_seen_beacon'):
        # Title: take the first heading that carries a "jobTitle-*" span so a
        # card can never contribute more than one title.
        job_name = missing
        for h2 in job.find_all('h2', class_=title_classes):
            job_name_span = h2.find('span', id=lambda x: x and x.startswith('jobTitle-'))
            if job_name_span is not None:
                job_name = job_name_span.get('title', job_name_span.text)
                break
        job_titles.append(job_name)

        # Company and location
        company_el = job.find('span', {'data-testid': 'company-name'})
        companies.append(company_el.text if company_el is not None else missing)
        location_el = job.find('div', {'data-testid': 'text-location'})
        locations.append(location_el.text if location_el is not None else missing)

        # Salary
        pay_div = job.find('div', class_='salary-snippet-container')
        salaries.append(pay_div.text.strip() if pay_div is not None else missing)

        # Job type: the metadata container may exist without the snippet
        # child, so both lookups are guarded.
        type_div = job.find('div', class_='metadata css-5zy3wz eu4oa1w0')
        type_snippet = None
        if type_div is not None:
            type_snippet = type_div.find('div', {'data-testid': 'attribute_snippet_testid'})
        job_types.append(type_snippet.text.strip() if type_snippet is not None else missing)

        # Description: text of the bullet list inside the description container
        descr = job.find('div', class_='css-1u8dvic eu4oa1w0')
        bullet_list = descr.find('ul') if descr is not None else None
        descriptions.append(bullet_list.text.strip() if bullet_list is not None else '')

        # Posting date
        date_element = job.find('span', class_='css-qvloho eu4oa1w0')
        dates.append(date_element.text.strip() if date_element is not None else missing)

        # Link: the href on the title anchor is appended to the site root
        heading = job.find('h2', class_='jobTitle')
        a_tag = heading.find('a') if heading is not None else None
        if a_tag is not None:
            links.append("https://www.indeed.com" + a_tag.get('href', ''))
        else:
            links.append(missing)

    return job_titles, companies, locations, salaries, job_types, descriptions, dates, links

In [4]:
def save_to_csv(dataframe, file_number):
    """Write ``dataframe`` to ``Cleaned_csvs/AI-UnitedStates-Page(<file_number>).csv``.

    The path is relative to the current working directory. The output
    directory is created if it does not exist yet, so the call also works on
    a fresh checkout. The index is not written.

    Args:
        dataframe: Object exposing ``to_csv(path, index=False)``.
        file_number: Value interpolated into the output file name.
    """
    output_text = f"Cleaned_csvs/AI-UnitedStates-Page({file_number}).csv"
    # to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(output_text), exist_ok=True)
    dataframe.to_csv(output_text, index=False)

In [5]:
def process_files_in_folder(folder_path):
    """Convert every ``<prefix>_<number>.txt`` page in ``folder_path`` to a CSV.

    Each matching file is read, parsed with BeautifulSoup's ``html.parser``,
    passed to ``parse_job_details`` and written to
    ``Cleaned_csvs/page_<number>.csv`` (relative to the current working
    directory), where ``<number>`` is the text between the first underscore
    and the following dot in the file name. A ``Saved ...`` line is printed
    per file written.

    ``.txt`` files without an underscore in their name are reported and
    skipped instead of aborting the whole run. Files are processed in sorted
    name order because ``os.listdir`` returns entries in arbitrary order.

    Args:
        folder_path: Directory to scan for ``.txt`` files (not recursive).
    """
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.txt'):
            continue

        name_parts = file_name.split('_')
        if len(name_parts) < 2:
            # e.g. an unrelated "notes.txt" sitting in the same folder
            print(f'Skipped {file_name}: no page number in file name')
            continue
        file_number = name_parts[1].split('.')[0]

        file_path = os.path.join(folder_path, file_name)
        content = read_html_from_file(file_path)
        soup = BeautifulSoup(content, 'html.parser')
        # The posting dates are parsed but are not part of the CSV schema.
        (job_titles, companies, locations, salaries,
         job_types, descriptions, _dates, links) = parse_job_details(soup)

        df = pd.DataFrame({
            'Job Title': job_titles,
            'Company': companies,
            'Location': locations,
            'Salary($)': salaries,
            'Job-Type': job_types,
            'Job-Description': descriptions,
            'Raw_Link': links,
        })

        csv_file_name = f'Cleaned_csvs/page_{file_number}.csv'
        # to_csv does not create missing parent directories.
        os.makedirs(os.path.dirname(csv_file_name), exist_ok=True)
        df.to_csv(csv_file_name, index=False)
        print(f'Saved {csv_file_name}')

In [6]:
# Convert every matching .txt page found in the current working directory ('.').
folder_path = '.'
process_files_in_folder(folder_path)

Saved Cleaned_csvs/page_1.csv
Saved Cleaned_csvs/page_10.csv
Saved Cleaned_csvs/page_100.csv
Saved Cleaned_csvs/page_1000.csv
Saved Cleaned_csvs/page_1001.csv
Saved Cleaned_csvs/page_101.csv
Saved Cleaned_csvs/page_102.csv
Saved Cleaned_csvs/page_103.csv
Saved Cleaned_csvs/page_104.csv
Saved Cleaned_csvs/page_105.csv
Saved Cleaned_csvs/page_106.csv
Saved Cleaned_csvs/page_107.csv
Saved Cleaned_csvs/page_108.csv
Saved Cleaned_csvs/page_109.csv
Saved Cleaned_csvs/page_11.csv
Saved Cleaned_csvs/page_110.csv
Saved Cleaned_csvs/page_111.csv
Saved Cleaned_csvs/page_112.csv
Saved Cleaned_csvs/page_113.csv
Saved Cleaned_csvs/page_114.csv
Saved Cleaned_csvs/page_115.csv
Saved Cleaned_csvs/page_116.csv
Saved Cleaned_csvs/page_117.csv
Saved Cleaned_csvs/page_118.csv
Saved Cleaned_csvs/page_119.csv
Saved Cleaned_csvs/page_12.csv
Saved Cleaned_csvs/page_120.csv
Saved Cleaned_csvs/page_121.csv
Saved Cleaned_csvs/page_122.csv
Saved Cleaned_csvs/page_123.csv
Saved Cleaned_csvs/page_124.csv
Saved Clean

Saved Cleaned_csvs/page_329.csv
Saved Cleaned_csvs/page_33.csv
Saved Cleaned_csvs/page_330.csv
Saved Cleaned_csvs/page_331.csv
Saved Cleaned_csvs/page_332.csv
Saved Cleaned_csvs/page_333.csv
Saved Cleaned_csvs/page_334.csv
Saved Cleaned_csvs/page_335.csv
Saved Cleaned_csvs/page_336.csv
Saved Cleaned_csvs/page_337.csv
Saved Cleaned_csvs/page_338.csv
Saved Cleaned_csvs/page_339.csv
Saved Cleaned_csvs/page_34.csv
Saved Cleaned_csvs/page_340.csv
Saved Cleaned_csvs/page_341.csv
Saved Cleaned_csvs/page_342.csv
Saved Cleaned_csvs/page_343.csv
Saved Cleaned_csvs/page_344.csv
Saved Cleaned_csvs/page_345.csv
Saved Cleaned_csvs/page_346.csv
Saved Cleaned_csvs/page_347.csv
Saved Cleaned_csvs/page_348.csv
Saved Cleaned_csvs/page_349.csv
Saved Cleaned_csvs/page_35.csv
Saved Cleaned_csvs/page_350.csv
Saved Cleaned_csvs/page_351.csv
Saved Cleaned_csvs/page_352.csv
Saved Cleaned_csvs/page_353.csv
Saved Cleaned_csvs/page_354.csv
Saved Cleaned_csvs/page_355.csv
Saved Cleaned_csvs/page_356.csv
Saved Clean

Saved Cleaned_csvs/page_560.csv
Saved Cleaned_csvs/page_561.csv
Saved Cleaned_csvs/page_562.csv
Saved Cleaned_csvs/page_563.csv
Saved Cleaned_csvs/page_564.csv
Saved Cleaned_csvs/page_565.csv
Saved Cleaned_csvs/page_566.csv
Saved Cleaned_csvs/page_567.csv
Saved Cleaned_csvs/page_568.csv
Saved Cleaned_csvs/page_569.csv
Saved Cleaned_csvs/page_57.csv
Saved Cleaned_csvs/page_570.csv
Saved Cleaned_csvs/page_571.csv
Saved Cleaned_csvs/page_572.csv
Saved Cleaned_csvs/page_573.csv
Saved Cleaned_csvs/page_574.csv
Saved Cleaned_csvs/page_575.csv
Saved Cleaned_csvs/page_576.csv
Saved Cleaned_csvs/page_577.csv
Saved Cleaned_csvs/page_578.csv
Saved Cleaned_csvs/page_579.csv
Saved Cleaned_csvs/page_58.csv
Saved Cleaned_csvs/page_580.csv
Saved Cleaned_csvs/page_581.csv
Saved Cleaned_csvs/page_582.csv
Saved Cleaned_csvs/page_583.csv
Saved Cleaned_csvs/page_584.csv
Saved Cleaned_csvs/page_585.csv
Saved Cleaned_csvs/page_586.csv
Saved Cleaned_csvs/page_587.csv
Saved Cleaned_csvs/page_588.csv
Saved Clea

Saved Cleaned_csvs/page_792.csv
Saved Cleaned_csvs/page_793.csv
Saved Cleaned_csvs/page_794.csv
Saved Cleaned_csvs/page_795.csv
Saved Cleaned_csvs/page_796.csv
Saved Cleaned_csvs/page_797.csv
Saved Cleaned_csvs/page_798.csv
Saved Cleaned_csvs/page_799.csv
Saved Cleaned_csvs/page_8.csv
Saved Cleaned_csvs/page_80.csv
Saved Cleaned_csvs/page_800.csv
Saved Cleaned_csvs/page_801.csv
Saved Cleaned_csvs/page_802.csv
Saved Cleaned_csvs/page_803.csv
Saved Cleaned_csvs/page_804.csv
Saved Cleaned_csvs/page_805.csv
Saved Cleaned_csvs/page_806.csv
Saved Cleaned_csvs/page_807.csv
Saved Cleaned_csvs/page_808.csv
Saved Cleaned_csvs/page_809.csv
Saved Cleaned_csvs/page_81.csv
Saved Cleaned_csvs/page_810.csv
Saved Cleaned_csvs/page_811.csv
Saved Cleaned_csvs/page_812.csv
Saved Cleaned_csvs/page_813.csv
Saved Cleaned_csvs/page_814.csv
Saved Cleaned_csvs/page_815.csv
Saved Cleaned_csvs/page_816.csv
Saved Cleaned_csvs/page_817.csv
Saved Cleaned_csvs/page_818.csv
Saved Cleaned_csvs/page_819.csv
Saved Cleane