In [None]:
# SCRAPING




import requests
from bs4 import BeautifulSoup
import csv
import time

# Shared scraper settings: site root, paginated listing URL pattern,
# HTTP request headers and the CSV file the results are written to.
base_url = "https://internshala.com"
page_url_template = f"{base_url}/jobs/matching-preferences/page-{{}}"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
output_file = "internshala_jobs_with_skills.csv"

# Function to extract skills from job detail page
def get_job_skills(job_detail_url, timeout: float = 10) -> str:
    """Return the skills listed on a job detail page as a comma-separated string.

    Args:
        job_detail_url: Absolute URL of the job detail page. A falsy value or
            the placeholder ``'N/A'`` (used by the caller when no link was
            found) is treated as "no page" and no request is made.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The stripped text of every ``span.round_tabs`` inside the
        ``div.round_tabs_container`` element joined with ``', '``, or
        ``'N/A'`` when the URL is missing, the request fails, or no skills
        are found.
    """
    # Nothing to fetch: avoid a request that is guaranteed to fail.
    if not job_detail_url or job_detail_url == 'N/A':
        return 'N/A'

    try:
        # Without an explicit timeout a stalled connection would block forever.
        res = requests.get(job_detail_url, headers=headers, timeout=timeout)
        # Do not parse error pages (4xx/5xx) as if they were job pages.
        res.raise_for_status()
    except requests.RequestException as e:
        print("⚠️ Error fetching skills:", e)
        return 'N/A'

    soup = BeautifulSoup(res.text, 'html.parser')
    skills_section = soup.find('div', class_='round_tabs_container')
    if not skills_section:
        return 'N/A'

    skills = [s.text.strip() for s in skills_section.find_all('span', class_='round_tabs')]
    return ', '.join(skills) if skills else 'N/A'

# Start scraping and saving to CSV
#
# Walks the paginated job listing for a fixed set of categories, extracts the
# summary fields of every job card, fetches each job's detail page for its
# skills, and appends one CSV row per job.

total_pages = 43        # listing pages 1..43 are visited
request_timeout = 10    # seconds; requests applies no timeout unless one is given

# Category slugs that make up the listing filter (joined with commas in the URL).
job_categories = [
    "net-development",
    "ai-agent-development",
    "asp-net",
    "android-app-development",
    "angular-js-development",
    "animation",
    "artificial-intelligence-ai",
    "backend-development",
    "big-data",
    "cloud-computing",
    "computer-science",
    "data-science",
    "database-building",
    "front-end-development",
    "full-stack-development",
    "game-design",
    "game-development",
    "graphic-design",
    "information-technology",
    "internet-of-things-iot",
    "java",
    "javascript-development",
    "machine-learning",
    "mobile-app-development",
    "network-engineering",
    "node-js-development",
    "php-development",
    "prompt-engineering",
    "python-django",
    "robotics",
    "software-development",
    "ui-ux",
    "web-development",
    "ios-app-development",
    "cybersecurity-jobs",
]
# Only the page number varies between requests, so build the prefix once.
listing_url_prefix = base_url + "/jobs/" + ",".join(job_categories) + "/page-"

with open(output_file, mode='w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Job Title', 'Company Name', 'Location', 'Salary', 'Work From Home', 'Skills'])

    for page_num in range(1, total_pages + 1):
        print(f"\n🔄 Scraping Page {page_num}")
        url = f"{listing_url_prefix}{page_num}"
        print(f"🔗 Visiting: {url}")

        # A failed listing request should cost one page, not the whole run.
        try:
            res = requests.get(url, headers=headers, timeout=request_timeout)
            res.raise_for_status()
        except requests.RequestException as e:
            print(f"⚠️ Error fetching page {page_num}:", e)
            time.sleep(1.5)  # keep the same pacing between pages
            continue

        soup = BeautifulSoup(res.text, 'html.parser')

        jobs = soup.find_all('div', class_='individual_internship')

        for index, job in enumerate(jobs, start=1):
            try:
                # Title and Job Detail Link
                title_tag = job.find('a', class_='job-title-href')
                title = title_tag.text.strip() if title_tag else 'N/A'
                detail_url = base_url + title_tag['href'] if title_tag and title_tag.get('href') else 'N/A'

                # Company
                company_tag = job.find('p', class_='company-name')
                company = company_tag.text.strip() if company_tag else 'N/A'

                # Location
                location_tag = job.find('p', class_='locations')
                location = location_tag.text.strip() if location_tag else 'N/A'

                # Salary
                salary_tag = job.find('span', class_='mobile')
                salary = salary_tag.text.strip() if salary_tag else 'N/A'

                # Work From Home: inferred from the word "home" in the location text
                work_from_home = 'Yes' if 'home' in location.lower() else 'No'

                # Progress Print
                print(f"   🔧 [{index}/{len(jobs)}] Job: {title} | Company: {company}")

                # Skill extraction: only hit the detail page when a link was found
                if detail_url != 'N/A':
                    skills = get_job_skills(detail_url)
                    time.sleep(0.5)  # delay for each job detail request
                else:
                    skills = 'N/A'

                # Write to CSV
                writer.writerow([title, company, location, salary, work_from_home, skills])

            except Exception as e:
                # Best-effort: one malformed job card must not stop the page.
                print("⚠️ Error processing job:", e)

        print(f"✅ Finished scraping Page {page_num}")
        time.sleep(1.5)  # delay between pages

print(f"\n🎉 All data (including skills) saved to '{output_file}'")


In [5]:
# Cleaned



import pandas as pd

# Load the CSV written by the scraping step.
#
# pandas' default missing-value parsing already converts 'N/A' to NaN, but it
# also treats ordinary strings such as 'NA', 'None' or 'null' as missing, which
# would silently drop legitimate rows. Restrict the missing markers to the two
# used in this dataset: an empty field and the literal 'N/A'.
file_path = "internshala_jobs_with_skills.csv"
df = pd.read_csv(file_path, keep_default_na=False, na_values=["", "N/A"])

# Both kinds of missing value are NaN at this point, so a single dropna()
# removes every row that has an empty or 'N/A' field in any column.
df_cleaned = df.dropna()

# Save cleaned data
df_cleaned.to_csv("internshala_jobs_fully_cleaned.csv", index=False)

print("✅ All rows with 'N/A' or empty values removed. Cleaned file saved.")
