In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import webbrowser

In [2]:
# Function to extract Job title
def get_jobtitle(new_soup):
    """Return the job title found on a parsed job-detail page.

    Parameters
    ----------
    new_soup : object exposing ``find`` (e.g. a BeautifulSoup document)
        Parsed page to search for an ``<h1>`` with class ``font-size-24``.

    Returns
    -------
    str
        The heading text with surrounding whitespace stripped, or the
        placeholder ``"No Job Title"`` when the lookup raises
        ``AttributeError`` (for instance because ``find`` returned ``None``).
    """
    try:
        # Any missing link in this chain (no page, no heading) surfaces as
        # AttributeError and is mapped to the placeholder below.
        return new_soup.find("h1", class_='font-size-24').text.strip()
    except AttributeError:
        return "No Job Title"

# Function to extract skills
def extract_skills(new_soup):
    """Collect the key skills listed on a parsed job-detail page.

    Parameters
    ----------
    new_soup : object exposing ``find`` (e.g. a BeautifulSoup document)
        Parsed page to search for the ``<ul>`` with class
        ``keyskills_keySkills_items__ej9_3``.

    Returns
    -------
    list of str
        Whitespace-stripped text of every ``<li>`` inside that list, in
        document order; an empty list when the ``<ul>`` is not found.
    """
    container = new_soup.find('ul', class_='keyskills_keySkills_items__ej9_3')
    if not container:
        return []
    return [entry.text.strip() for entry in container.find_all('li')]

# --- Scrape configuration -------------------------------------------------
base_url = 'https://www.shine.com/job-search/manager-jobs?q=manager'
SITE_ROOT = "https://www.shine.com"        # prefix for the relative job-detail hrefs
FIRST_PAGE = 1
LAST_PAGE = 69                             # inclusive; adjust to scrape more or fewer result pages
REQUEST_TIMEOUT = 30                       # seconds; without it a stalled connection blocks forever
OUTPUT_CSV = "12310066FPshine_data.csv"

# Column-oriented accumulator: one entry per successfully fetched job page.
d = {
    'jobtitle': [],
    'skills': [],
}

# One session for the whole crawl so the underlying connection can be reused.
with requests.Session() as session:
    for page_number in range(FIRST_PAGE, LAST_PAGE + 1):
        url = f'{base_url}&page={page_number}'
        try:
            webpage = session.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # A network error on one page must not discard everything collected so far.
            print(f"Failed to retrieve the webpage. Error: {exc}")
            continue

        # Only parse listing pages that were served successfully (status code 200).
        if webpage.status_code != 200:
            print(f"Failed to retrieve the webpage. Status code: {webpage.status_code}")
            continue

        soup = BeautifulSoup(webpage.content, "html.parser")

        # Each result is an <h2 itemprop="name"> wrapping the link to the job page.
        links_list = []
        for heading in soup.find_all("h2", attrs={'itemprop': 'name'}):
            anchor = heading.find('a')
            href = anchor.get('href') if anchor else None
            if href:  # skip headings without a usable link
                links_list.append(href)

        for link in links_list:
            complete_url = f"{SITE_ROOT}{link}"
            try:
                new_webpage = session.get(complete_url, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                print(f"Failed to retrieve the job page {complete_url}. Error: {exc}")
                continue

            # An error page is not a job posting: skip it rather than recording
            # a spurious "No Job Title" / empty-skills row.
            if new_webpage.status_code != 200:
                print(
                    f"Failed to retrieve the job page {complete_url}. "
                    f"Status code: {new_webpage.status_code}"
                )
                continue

            new_soup = BeautifulSoup(new_webpage.content, "html.parser")

            # An empty title is recorded with the same placeholder that
            # get_jobtitle uses when the heading is missing.
            d['jobtitle'].append(get_jobtitle(new_soup) or "No Job Title")
            # extract_skills already returns [] when no skills are listed.
            d['skills'].append(extract_skills(new_soup))

# Build the DataFrame in one chain. Replacing via .assign avoids the chained
# in-place mutation `df[col].replace(..., inplace=True)`, which does not update
# the parent frame under pandas copy-on-write.
linkjd_df = (
    pd.DataFrame.from_dict(d)
    .assign(jobtitle=lambda frame: frame['jobtitle'].replace('', np.nan))
    .dropna(subset=['jobtitle'])
)

# Save DataFrame to CSV file
linkjd_df.to_csv(OUTPUT_CSV, header=True, index=False)

print("DataFrame created and saved to CSV successfully.")


DataFrame created and saved to CSV successfully.


In [3]:
# Show the scraped result; as the cell's last expression the DataFrame is rendered as a table.
linkjd_df

Unnamed: 0,jobtitle,skills
0,Agency Manager Senior Agency Manager,"[insurance, sales, recruitment, agency develop..."
1,Leading role for Business Development Manager ...,"[agent recruitment, agency sales, field sales,..."
2,Business Development Manager,"[direct sales, sales, insurance sales, insuran..."
3,Branch Manager (Location KONASEEMA),"[Customer Service, Retail Banking, Sales, Team..."
4,Branch Operations Manager (Location Mulugu),"[Retail Banking, Branch Operations, Customer S..."
...,...,...
1375,Territory Sales Manager,"[territory, sales, territory sales, territory ..."
1376,Assistant Area Sales Manager/ Area Sales Manager,"[cashiers, sales coordination, account coordin..."
1377,Content Marketing Manager / General Manager,"[marketing communication, marketing management..."
1378,Content Marketing Manager / General Manager,"[marketing communication, marketing management..."
