In [10]:
import os
import requests
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import joblib

In [11]:
def create_data_folder():
    folder_name = 'scraped_data'
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    return folder_name

In [12]:
def save_data_to_csv(data, folder_name):
    csv_file_path = os.path.join(folder_name, 'scraped_data.csv')
    data.to_csv(csv_file_path, index=False)
    print(f"Scraped data saved to: {csv_file_path}")

In [13]:
def scrape_news(url, text_selector='p'):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        article_text = ' '.join([element.text for element in soup.select(text_selector)])

        return article_text
    except requests.exceptions.RequestException as e:
        print(f"Error scraping {url}: {e}")
        return None

In [14]:
def scrape_articles_in_section(base_url, section, max_articles=10):
    url = f'{base_url}/{section}'
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        article_urls = [a['href'] for a in soup.select('a.qa-heading-link')]

        articles = []
        for article_url in article_urls[:max_articles]:
            article_text = scrape_news(f'{base_url}{article_url}')
            if article_text:
                articles.append({'text': article_text, 'section': section})
        return articles
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {url}: {e}")
        return []

In [15]:
def train_and_evaluate_model(X_train, y_train, X_test, y_test, folder_name):
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    param_dist = {'alpha': [0.1, 0.5, 1.0, 2.0]}
    random_search = RandomizedSearchCV(MultinomialNB(), param_distributions=param_dist, n_iter=4, cv=5, scoring='accuracy', random_state=42)
    random_search.fit(X_train_tfidf, y_train)

    best_alpha = random_search.best_params_['alpha']
    print(f"Best alpha: {best_alpha}")

    model = MultinomialNB(alpha=best_alpha)
    model.fit(X_train_tfidf, y_train)

    model_file_path = os.path.join(folder_name, 'text_classification_model.joblib')
    joblib.dump(model, model_file_path)

    predictions = model.predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions)
    confusion_mat = confusion_matrix(y_test, predictions)

    results = pd.DataFrame({'Accuracy': [accuracy]})
    results.to_csv(os.path.join(folder_name, 'classification_results.csv'), index=False)

In [16]:
def main():
    base_url = 'https://www.bbc.com'
    sections = ['news/world', 'news/uk', 'news/business', 'news/technology', 'news/science_and_environment', 'news/health', 'news/education', 'news/entertainment_and_arts']

    data = {'text': [], 'section': []}

    for section in sections:
        articles = scrape_articles_in_section(base_url, section)
        data['text'].extend(article['text'] for article in articles)
        data['section'].extend(article['section'] for article in articles)

    df = pd.DataFrame(data)

    folder_name = create_data_folder()
    save_data_to_csv(df, folder_name)

    X_train, X_test, y_train, y_test = train_test_split(df['text'], df['section'], test_size=0.2, random_state=42)

    train_and_evaluate_model(X_train, y_train, X_test, y_test, folder_name)

In [17]:
if __name__ == '__main__':
    main()

Scraped data saved to: scraped_data\scraped_data.csv
Best alpha: 0.1
