In [1]:
# Step 0: Import libraries
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import pandas as pd
import time
import logging

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(levelname)s:%(message)s')

# Chrome driver setup (replace path with your chromedriver location)
service = Service("chromedriver.exe")  # or full path if needed
driver = webdriver.Chrome(service=service)



In [3]:
# Step 1: Function to extract skills from a single internship card
def get_skills(internship):
    skills_container = internship.find("div", class_="job_skills")
    if not skills_container:
        return None
    skills = skills_container.find_all("div", class_="job_skill")
    skills_list = [s.get_text(strip=True) for s in skills]
    return ", ".join(skills_list)  # comma-separated string


In [5]:
# Step 2: Function to extract internship details
def extract_internship_data(internship):
    try:
        title = internship.find("a", class_="job-title-href").get_text(strip=True)
        company = internship.find("p", class_="company-name").get_text(strip=True)
        location = internship.find("div", class_="locations").get_text(" ", strip=True)
        stipend_tag = internship.find("span", class_="stipend")
        stipend = stipend_tag.get_text(strip=True) if stipend_tag else None
        duration_tag = internship.find("i", class_="ic-16-calendar")
        duration = duration_tag.find_next("span").get_text(strip=True) if duration_tag else None
        apply_link = "https://internshala.com" + internship.get("data-href", "")
        skills = get_skills(internship)
        
        return {
            "Title": title,
            "Company": company,
            "Location": location,
            "Stipend": stipend,
            "Duration": duration,
            "Skills": skills,
            "Apply Link": apply_link
        }
    except Exception as e:
        logging.error(f"Error extracting internship: {e}")
        return None


In [7]:
# Step 3: Function to scrape a single page of internships
def scrape_page(url):
    driver.get(url)
    time.sleep(3)  # wait for page to load

    soup = BeautifulSoup(driver.page_source, "html.parser")
    # Find all internship cards
    internships = soup.find_all("div", class_="individual_internship")
    # Filter real internships (skip promoted ads)
    real_internships = [i for i in internships if "jos_native_ad_text" not in i.get("class", [])]
    
    page_data = []
    for idx, internship in enumerate(real_internships, start=1):
        data = extract_internship_data(internship)
        if data:
            page_data.append(data)
            logging.info(f"Scraped internship #{idx}: {data['Title']}")
    
    return page_data


In [11]:
# Step 4: Scrape limited pages (e.g., first 12 pages)
all_internships = []
max_pages = 12  # set the maximum number of pages to scrape

for page in range(1, max_pages + 1):
    url = f"https://internshala.com/internships/python-internship?page={page}"
    logging.info(f"Scraping page {page}...")
    
    page_data = scrape_page(url)
    
    if not page_data:  # stop if no internships on page
        logging.info(f"No internships found on page {page}. Stopping early.")
        break
    
    all_internships.extend(page_data)


INFO:Scraping page 1...
INFO:Scraped internship #1: Video Editing/Making
INFO:Scraped internship #2: Artificial Intelligence (AI)
INFO:Scraped internship #3: AI Agent Development
INFO:Scraped internship #4: Blockchain Development
INFO:Scraped internship #5: Automation Testing
INFO:Scraped internship #6: Internet Of Things (IoT)
INFO:Scraped internship #7: Full Stack Development
INFO:Scraped internship #8: Software Engineer
INFO:Scraped internship #9: Python Development
INFO:Scraped internship #10: Signal Processing
ERROR:Error extracting internship: 'NoneType' object has no attribute 'get_text'
INFO:Scraped internship #12: Technical Doubt Support
INFO:Scraped internship #13: Android App Development
INFO:Scraped internship #14: Full Stack Development
INFO:Scraped internship #15: Python Development
INFO:Scraped internship #16: Machine Learning
INFO:Scraped internship #17: Business Development (Sales)
INFO:Scraped internship #18: Software Development
INFO:Scraped internship #19: Automatio

In [13]:
# Step 5: Convert to DataFrame and save
df = pd.DataFrame(all_internships)
df.to_csv("internshala_internships.csv", index=False)

# Make sure 'openpyxl' is installed for Excel
df.to_excel("internshala_internships.xlsx", index=False)

logging.info(f"Saved {len(df)} internships to CSV & Excel")


INFO:Saved 480 internships to CSV & Excel


In [15]:
# Step 6: Preview first 5 rows
df.head()


Unnamed: 0,Title,Company,Location,Stipend,Duration,Skills,Apply Link
0,Video Editing/Making,603 The CoWorking Space India,"Mumbai, Bandra","₹ 12,000 - 15,000 /month",3 Months,"JavaScript, Creativity, Python, Time Managemen...",https://internshala.com/internship/detail/vide...
1,Artificial Intelligence (AI),Quantum Value CFO Services,"Delhi, Faridabad, Gurgaon, Noida (Hybrid)","₹ 12,000 - 12,400 /month",3 Months,"Python, Data Structures, Machine Learning, Dat...",https://internshala.com/internship/detail/arti...
2,AI Agent Development,Hairdrama.com,Ahmedabad,"₹ 9,000 - 15,000 /month",3 Months,"Python, React, OpenAI API, Gemini",https://internshala.com/internship/detail/ai-a...
3,Blockchain Development,Monkhub,Gurgaon,"₹ 10,000 /month",6 Months,"Java, JavaScript, Python, .NET, Git, React, RE...",https://internshala.com/internship/detail/bloc...
4,Automation Testing,Think Future Technologies Private Limited,,"₹ 8,000 - 10,000 /month",6 Months,"Python, GitHub, Jenkins, Vibe Coding, Cursor (...",https://internshala.com/internship/detail/work...


In [17]:
# Step 7: Always close Selenium when done
driver.quit()
