In [10]:
# ==============================================================================
# Automated Web Scraping & Text Analysis Pipeline
# Target: arXiv.org (Academic Papers on Chatbots & Education)
# Author: Robin Masson
# ==============================================================================

import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

def _collapse_whitespace(text):
    """Return *text* with each run of whitespace replaced by a single space."""
    return ' '.join(text.split())


def scrape_arxiv_chatbots_education():
    """
    Scrape the most recent arXiv search results about chatbots in education.

    Sends a single GET request to the arXiv search page (query
    ``"chatbot" AND "education"``, ordered by ``-announced_date_first``,
    ``size=25``) and extracts the title, authors, and full abstract from every
    ``<li class="arxiv-result">`` element of the returned HTML.

    Returns:
        list[dict]: One dict per parsed paper with the string keys
        ``'title'``, ``'authors'`` and ``'abstract'``. An empty list is
        returned when the HTTP request fails or no result could be parsed.

    Notes:
        Request errors are not raised; they are printed and turned into an
        empty list. Results missing any of the three expected elements are
        skipped, and the number of skipped results is printed.
    """
    print("Initiating scraping pipeline for arXiv...")

    # Targeted search URL: chatbots + education.
    url = 'https://arxiv.org/search/cs?query="chatbot"+AND+"education"&searchtype=all&abstracts=show&order=-announced_date_first&size=25'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    papers_data = []
    skipped = 0

    # Extract the data by walking the DOM.
    results = soup.find_all('li', class_='arxiv-result')

    for result in results:
        title_tag = result.find('p', class_='title')
        authors_tag = result.find('p', class_='authors')
        abstract_span = result.find('span', class_='abstract-full')

        # Explicit checks instead of a blanket `except AttributeError`, so an
        # incomplete entry is counted and reported rather than vanishing.
        if title_tag is None or authors_tag is None or abstract_span is None:
            skipped += 1
            continue

        # The markup puts line breaks and indentation between child tags;
        # collapse them so the fields are clean single-line strings.
        title = _collapse_whitespace(title_tag.text)
        authors = _collapse_whitespace(authors_tag.text.replace('Authors:', ''))

        # Drop the first embedded link (presumably the collapse toggle) so
        # its label does not end up in the abstract text.
        toggle_link = abstract_span.find('a')
        if toggle_link is not None:
            toggle_link.decompose()
        # Only the ends are trimmed: callers that measure the abstract's
        # length keep seeing the same text as before.
        abstract = abstract_span.text.strip()

        papers_data.append({
            'title': title,
            'authors': authors,
            'abstract': abstract
        })

    if skipped:
        print(f"Skipped {skipped} result(s) with missing title, authors, or abstract markup.")

    print(f"Successfully scraped {len(papers_data)} academic papers.")
    return papers_data

# Run the scraper and load whatever it returned into a DataFrame.
data = scrape_arxiv_chatbots_education()
df = pd.DataFrame(data)

if df.empty:
    # Nothing was scraped (request failed or no result could be parsed).
    print("No data found.")
else:
    print("\nPerforming basic text analysis (Complexity & Length)...")

    def _count_words(text):
        """Return the number of whitespace-separated tokens in *text*."""
        return len(text.split())

    # Two simple size metrics per abstract: characters and words.
    df['abstract_length'] = df['abstract'].map(len)
    df['word_count'] = df['abstract'].map(_count_words)

    # Persist the enriched table as a list of JSON records.
    json_filename = 'chatbot_education_data.json'
    df.to_json(json_filename, orient='records', indent=4)
    print(f"Data successfully exported to JSON: {json_filename}")

    # Show a preview of the first rows (`display` is provided by the
    # notebook environment).
    preview_columns = ['title', 'word_count', 'abstract_length']
    display(df[preview_columns].head())

Initiating scraping pipeline for arXiv...
Successfully scraped 25 academic papers.

Performing basic text analysis (Complexity & Length)...
Data successfully exported to JSON: chatbot_education_data.json


Unnamed: 0,title,word_count,abstract_length
0,"""How Do I ...?"": Procedural Questions Predomin...",225,1581
1,"Games That Teach, Chats That Convince: Compari...",172,1289
2,RelianceScope: An Analytical Framework for Exa...,245,1850
3,Search in Transition: A Study of University St...,194,1486
4,Open TutorAI: An Open-source Platform for Pers...,228,1844
