In [1]:
import requests  # type: ignore
from bs4 import BeautifulSoup # type: ignore
from urllib.parse import urljoin
import warnings
warnings.filterwarnings("ignore")

In [2]:
def scrape_and_get_links(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            button_group_section = soup.find('section', class_='section__button-group')
            if button_group_section:
                link = button_group_section.find('a')
                if link and link.has_attr('href'):
                    href = link['href']
                    full_url = urljoin(url, href)
                    return full_url
                else:
                    print(f"No link found in the section on {url}.")
            else:
                print(f"Section with class 'section__button-group' not found on {url}.")
        else:
            print(f"Failed to retrieve the page {url}. Status code: {response.status_code}")
    except Exception as e:
        print(f"Error occurred while scraping {url}: {e}")


In [3]:
def extract_courses(page_url):
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    courses = []
    course_cards = soup.find_all('li', class_='products__list-item')

    for card in course_cards:
        title_tag = card.find('h3')
        lessons_tag = card.find('span', class_='course-card__lesson-count')
        price_tag = card.find('span', class_='course-card__price')
        image_tag = card.find('img', class_='course-card__img')
        href_tag = card.find('a', class_='course-card')['href']
        ratings_count = card.find('span', class_='review__stars-count')
        if title_tag and lessons_tag and price_tag and image_tag and href_tag:
            course = {
                "title": title_tag.get_text(strip=True),
                "lessons": lessons_tag.get_text(strip=True),
                "price": price_tag.get_text(strip=True),
                "ratings": int(ratings_count.text.strip('()')) if ratings_count else 0,
                "image_url": image_tag['src'],
                "course_url": urljoin("https://courses.analyticsvidhya.com", href_tag)
            }
            courses.append(course)
    
    return courses


In [4]:
def scrape_all_courses(view_more_url):
    all_courses = []
    page = 1
    pagination_pattern = f'{view_more_url}?page={{}}'

    while True:
        if page == 1:
            url = view_more_url
        else:
            url = pagination_pattern.format(page)
        
        print(f"Scraping page {page}: {url}")
        page_courses = extract_courses(url)
        
        if not page_courses:
            break
        
        all_courses.extend(page_courses)
        page += 1
    
    return all_courses

In [5]:
def scrape_course_description(course_url):
    response = requests.get(course_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        description_div = soup.find('div', class_='rich-text__container')
        if description_div:
            return description_div.get_text(strip=True)
    return "Description not available."

In [6]:
import requests
import csv
import json 

In [7]:
def main():
    base_url = "https://courses.analyticsvidhya.com/pages/all-free-courses"
    view_more_url = scrape_and_get_links(base_url)
    
    if view_more_url:
        all_courses = scrape_all_courses(view_more_url)
        free_courses = [course for course in all_courses if course['price'] == 'Free']
        
        for course in free_courses:
            course['description'] = scrape_course_description(course['course_url'])
        
        # Store the data in a CSV file
        with open('Freecourses.csv', 'w', newline='') as file:
            writer = csv.DictWriter(file, fieldnames=free_courses[0].keys())
            writer.writeheader()
            for course in free_courses:
                writer.writerow(course)
        
        # Store the data in a JSON file
        with open('Freecourses.json', 'w') as file:
            json.dump(free_courses, file, indent=4)
        
        return free_courses
    else:
        print("No view more URL found.")


In [8]:
if __name__ == "__main__":
    free_courses = main()

Scraping page 1: https://courses.analyticsvidhya.com/collections
Scraping page 2: https://courses.analyticsvidhya.com/collections?page=2
Scraping page 3: https://courses.analyticsvidhya.com/collections?page=3
Scraping page 4: https://courses.analyticsvidhya.com/collections?page=4
Scraping page 5: https://courses.analyticsvidhya.com/collections?page=5
Scraping page 6: https://courses.analyticsvidhya.com/collections?page=6
Scraping page 7: https://courses.analyticsvidhya.com/collections?page=7
Scraping page 8: https://courses.analyticsvidhya.com/collections?page=8
Scraping page 9: https://courses.analyticsvidhya.com/collections?page=9


In [9]:
import pandas as pd # type: ignore
df = pd.read_csv('Freecourses.csv',encoding='latin1')
df.head()

Unnamed: 0,title,lessons,price,ratings,image_url,course_url,description
0,Agentic AI Pioneer Program - Launching Soon!,1 Lessons,Free,0,https://import.cdn.thinkific.com/118220/oJYYBF...,https://courses.analyticsvidhya.com/courses/ag...,Description not available.
1,Creating Problem-Solving Agents using GenAI fo...,6 Lessons,Free,0,https://import.cdn.thinkific.com/118220/ih6JsB...,https://courses.analyticsvidhya.com/courses/cr...,Course DescriptionThis introductory course pro...
2,Improving Real World RAG Systems: Key Challeng...,12 Lessons,Free,0,https://import.cdn.thinkific.com/118220/70WfAJ...,https://courses.analyticsvidhya.com/courses/im...,Course DescriptionThis course explores the key...
3,Framework to Choose the Right LLM for your Bus...,6 Lessons,Free,0,https://import.cdn.thinkific.com/118220/A3lVcQ...,https://courses.analyticsvidhya.com/courses/ch...,Course DescriptionThis course will guide you t...
4,Building Smarter LLMs with Mamba and State Spa...,14 Lessons,Free,0,https://import.cdn.thinkific.com/118220/FbdKUq...,https://courses.analyticsvidhya.com/courses/SS...,Course DescriptionUnlock the Power of State Sp...
