In [5]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Set up headers for the request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win 64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36'
}

# Initialize lists to hold the data
title_list = []
type_list = []
name_list = []
location_list = []
experience_list = []
salary_list = []

# Loop through the six pages
for page in range(1, 7):  # Page numbers 1 to 6
    url = f"https://internshala.com/jobs/data-science-jobs/page-{page}/"

    # Request the webpage
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise an error for bad responses
        webpage = response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage for page {page}: {e}")
        continue  # Skip to the next page if there's an error

    # Parse the webpage content
    soup = BeautifulSoup(webpage, 'lxml')
    internships = soup.find_all("div", class_="individual_internship")

    # Extract information for each internship
    for internship in internships:
        # Extracting the company name
        company_div = internship.find('p', class_='company-name')
        name_list.append(company_div.text.strip() if company_div else "No data")

        # Extracting the location
        location_div = internship.find('p', class_='locations')
        location = location_div.find('a').text.strip() if location_div and location_div.find('a') else "No data"
        location_list.append(location)

        # Extracting the experience required
        experience_div = internship.find('div', class_='row-1-item')
        experience_list.append(experience_div.text.strip() if experience_div else "No data")

        # Extracting the internship type
        type_div = internship.find('div', class_='status-li')
        type_list.append(type_div.text.strip() if type_div else "No data")

        # Extracting the title of the internship
        title_div = internship.find('h3', class_='job-internship-name')
        title_list.append(title_div.text.strip() if title_div else "No data")

        # Extracting the salary information
        salary_div = internship.find('span', class_='desktop')
        salary_list.append(salary_div.text.strip() if salary_div else "No data")

# Create a DataFrame to store the extracted data
df = pd.DataFrame({
    "Title": title_list,
    "Type": type_list,
    "Company": name_list,
    "CTC": salary_list,
    "Location": location_list,
    "Exp": experience_list
})

# Display the DataFrame
df = df.drop_duplicates(keep = "first")
df

Unnamed: 0,Title,Type,Company,CTC,Location,Exp
0,Business Analyst,Fresher Job,Express Rupya,"₹ 2,00,000 - 2,10,000",Mumbai,0-2 years
1,Business Analyst,No data,Daltech Engineering Private Limited,"₹ 2,00,000",Ahmedabad,1-5 years
2,Data Operations Associate (NLP/AI),Fresher Job,Digit88,"₹ 2,00,000 - 2,50,000",Work from home,0-1 years
3,Artificial Intelligence (AI) Associate,International,"Diffblue (St Louis, United States)","$ 4,200 - 5,100",Work from home,0-5 years
4,Business Analysis,Part time,Saurabh Enterprise,"₹ 4,00,000 - 5,00,000",Work from home,1 year
...,...,...,...,...,...,...
234,Senior Computer Vision Engineer,No data,JTek Software Solutions Pvt Ltd,Competitive salary,Hyderabad,7-10 years
236,Web Developer UIUX,No data,JTek Software Solutions Pvt Ltd,Competitive salary,Hyderabad,7-10 years
237,Senior Backend Engineer,No data,JTek Software Solutions Pvt Ltd,Competitive salary,Bangalore,7-10 years
238,Cloud Backend Engineer,No data,JTek Software Solutions Pvt Ltd,Competitive salary,Bangalore,4-6 years


In [6]:
df.to_csv("Internshala Data_Science.csv")