# Collect data from Wikipedia

The dataset collected by this script via WikiTopicsCollector consists of Wikipedia pageviews and metadata for various topics related to artificial intelligence, machine learning, and related fields.

The data is gathered through the Wikimedia API, which provides access to page metadata, links to related topics, and daily pageview statistics over the past ten years.

The collection process starts with a predefined list of seed topics, which are expanded recursively by following Wikipedia’s internal links. The seed topics include two main categories:



*   **AI & Data Science Domains:** Examples include Artificial Intelligence, Machine Learning, Deep Learning, Computer Vision, Natural Language Processing, Reinforcement Learning, and Data Science.
*   **AI Models & Techniques:** Examples include ResNet, YOLO, BERT, GPT-3, AlphaGo, PPO, GANs, Stable Diffusion, Wav2Vec, and Graph Neural Networks.

For each topic, page metadata is collected and daily pageviews are retrieved. Each row of the resulting dataset contains the following fields:


* article: The Wikipedia page title.
* date: The timestamp of the recorded pageviews in YYYYMMDDHH format, as returned by the API (the hour is always 00 for daily data).
* views: The number of pageviews for that day.
* weekday: The day of the week corresponding to the date.
* page_id: A unique identifier for the Wikipedia page.
* creation_date: The date when the Wikipedia page was first created.
* last_modified: The last time the page was updated.
* page_url: The direct link to the Wikipedia page.
* description: A short text snippet (the first 500 characters) extracted from the page’s plain-text introduction.
* category: The first Wikipedia category assigned to the page.
* subcategory: The second Wikipedia category (if available).

The data collection process ensures that each topic’s evolution and popularity trends can be analyzed over time.

In [None]:
# Mount Google Drive at /content/drive so that paths below it (for example the
# /content/drive/MyDrive/... save directory used by the collector) are
# reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import time
from collections import defaultdict
from datetime import datetime, timedelta
from urllib.parse import quote

import pandas as pd
import requests

class WikiTopicsCollector:
    """Crawl Wikipedia topics and append their daily pageviews to one CSV file.

    Starting from a list of seed titles, the collector queries the MediaWiki
    API for page metadata and outgoing links, fetches the daily pageviews of
    every page it finds from the Wikimedia pageviews API, and appends one row
    per (article, day) to ``self.filepath``.
    """

    # Seconds to wait for a single HTTP response before giving up.
    REQUEST_TIMEOUT = 30
    # Buffered rows are flushed to disk once at least this many have accumulated.
    SAVE_BATCH_SIZE = 100
    # How far back (in days) pageviews are requested.
    DAYS_BACK = 3650

    def __init__(self, drive_path=None):
        """Set up API endpoints and the output location.

        Args:
            drive_path: Directory the CSV is written to. Defaults to
                ``/content/drive/MyDrive/AITrendAnalysis-project/``.
        """
        print("Initializing WikiTopicsCollector...")
        self.base_url = "https://wikimedia.org/api/rest_v1/metrics/pageviews"
        self.wiki_api_url = "https://en.wikipedia.org/w/api.php"
        self.headers = {
            'User-Agent': 'TopicsCollector/1.0 (your@email.com)'
        }

        # Mount Google Drive and set up save path. Outside Colab the import
        # fails; that is reported and the collector keeps going.
        try:
            from google.colab import drive
            drive.mount('/content/drive')
            print("Successfully mounted Google Drive")
        except Exception as e:
            print(f"Error mounting drive: {e}")

        self.save_path = drive_path or "/content/drive/MyDrive/AITrendAnalysis-project/"
        self.filename = "wikipedia_data_with_categories.csv"  # Updated filename
        self.filepath = os.path.join(self.save_path, self.filename)
        print(f"Will save data to: {self.filepath}")

        os.makedirs(self.save_path, exist_ok=True)
        print("Save directory confirmed/created")

    def discover_related_topics(self, seed_topics, max_depth=3, max_topics=1000):
        """Expand the seed topics by following page links and save pageviews.

        Args:
            seed_topics: Iterable of Wikipedia page titles to start from.
            max_depth: Number of link-following rounds to run.
            max_topics: Once more than this many topics have been visited the
                crawl stops. The cap is checked only after a full depth level
                has been processed, so a single level may exceed it.

        Returns:
            A ``(topics, topic_metadata)`` tuple: the list of all topics that
            were scheduled for processing, and a dict mapping each topic with
            an existing page to the metadata collected for it.
        """
        print(f"\nStarting topic discovery with {len(seed_topics)} seed topics...")
        all_topics = set(seed_topics)
        topic_metadata = defaultdict(dict)
        topics_to_process = set(seed_topics)
        processed_topics = set()
        data_buffer = []

        for depth in range(max_depth):
            print(f"\nExploring depth {depth + 1}/{max_depth}")
            new_topics = set()

            for topic in topics_to_process:
                if topic in processed_topics:
                    continue

                print(f"Processing topic: {topic}")

                try:
                    page = self._fetch_page(topic)

                    # Missing pages come back without a 'pageid'; skip them.
                    if 'pageid' in page:
                        metadata = self._extract_metadata(page)
                        topic_metadata[topic] = metadata

                        pageview_data = self._get_pageviews(topic, metadata)
                        if pageview_data:
                            data_buffer.extend(pageview_data)

                            if len(data_buffer) >= self.SAVE_BATCH_SIZE:
                                self._save_to_csv(data_buffer)
                                print(f"Saved batch of {len(data_buffer)} records")
                                data_buffer = []

                        # Collect related topics
                        new_topics.update(
                            link['title'] for link in page.get('links', [])
                        )

                    processed_topics.add(topic)
                    time.sleep(1)  # stay polite towards the API

                except Exception as e:
                    # Best effort: one failing topic must not abort the crawl.
                    print(f"Error processing {topic}: {e}")
                    continue

            all_topics.update(topics_to_process)
            topics_to_process = new_topics - processed_topics
            print(f"Depth {depth + 1} complete. Found {len(new_topics)} new topics")

            if data_buffer:
                self._save_to_csv(data_buffer)
                print(f"Saved remaining {len(data_buffer)} records")
                data_buffer = []

            if len(all_topics) > max_topics:
                print("Reached maximum topic limit")
                break

        return list(all_topics), topic_metadata

    def _fetch_page(self, topic):
        """Query the MediaWiki API for one title and return its page record."""
        params = {
            "action": "query",
            "format": "json",
            "prop": "links|categories|info|extracts",
            "titles": topic.replace("_", " "),
            # Only the first batch of up to 500 links/categories is used;
            # API continuation is not requested.
            "pllimit": "500",
            "cllimit": "500",
            # NOTE(review): 'created' does not appear among the documented
            # inprop values, so 'creation_date' may always be empty - confirm.
            "inprop": "url|created",
            "exintro": True,
            "explaintext": True
        }

        response = requests.get(
            self.wiki_api_url,
            params=params,
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        data = response.json()
        return next(iter(data['query']['pages'].values()))

    @staticmethod
    def _extract_metadata(page):
        """Pick the metadata fields stored with every pageview row."""
        return {
            'page_id': page['pageid'],
            'creation_date': page.get('created'),
            'last_modified': page.get('touched'),
            'page_url': page.get('fullurl'),
            'description': page.get('extract', '')[:500],
            'categories': [cat['title'] for cat in page.get('categories', [])]  # Extract categories
        }

    def _pageviews_url(self, article, start_date, end_date):
        """Build the per-article daily pageviews URL for ``article``.

        Spaces become underscores and the title is percent-encoded so that
        characters such as '/' or '?' cannot change the request path.
        """
        title = quote(article.replace(" ", "_"), safe="")
        return (
            f"{self.base_url}/per-article/en.wikipedia.org/all-access/user/"
            f"{title}/daily/{start_date}/{end_date}"
        )

    @staticmethod
    def _build_rows(article, items, metadata):
        """Turn pageview API items into flat CSV rows enriched with metadata."""
        categories = metadata.get('categories') or []
        category = categories[0] if categories else ''
        subcategory = categories[1] if len(categories) > 1 else ''

        return [{
            'article': article,
            'date': item['timestamp'],
            'views': item['views'],
            # Daily timestamps end in the hour "00", hence the literal suffix.
            'weekday': datetime.strptime(item['timestamp'], '%Y%m%d00').strftime('%A'),
            'page_id': metadata.get('page_id', ''),
            'creation_date': metadata.get('creation_date', ''),
            'last_modified': metadata.get('last_modified', ''),
            'page_url': metadata.get('page_url', ''),
            'description': metadata.get('description', ''),
            'category': category,
            'subcategory': subcategory
        } for item in items]

    def _get_pageviews(self, article, metadata):
        """Get pageviews for a single article.

        Returns a list of row dicts (one per day), or an empty list when the
        response carries no 'items' or the request fails.
        """
        try:
            start_date, end_date = self._get_date_range(self.DAYS_BACK)
            url = self._pageviews_url(article, start_date, end_date)

            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
            data = response.json()

            # Error responses carry no 'items' key.
            if 'items' not in data:
                return []

            return self._build_rows(article, data['items'], metadata)

        except Exception as e:
            print(f"Error getting pageviews for {article}: {e}")
            return []

    def _get_date_range(self, days_back):
        """Return ``(start, end)`` as YYYYMMDD strings, ``end`` being today."""
        end_date = datetime.now()
        start_date = end_date - timedelta(days=days_back)
        return start_date.strftime('%Y%m%d'), end_date.strftime('%Y%m%d')

    @staticmethod
    def _append_csv(df, path):
        """Append ``df`` to ``path``, writing the header only for a new/empty file."""
        needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
        df.to_csv(path, mode='a', header=needs_header, index=False)

    def _save_to_csv(self, data):
        """Append rows to the single output CSV file.

        If writing to ``self.filepath`` fails, the rows are appended to a file
        of the same name in the current working directory instead, so earlier
        fallback batches are not overwritten.
        """
        if not data:
            return

        df = pd.DataFrame(data)
        try:
            self._append_csv(df, self.filepath)
            print(f"Successfully saved {len(data)} rows to {self.filepath}")
        except Exception as e:
            # Deliberately broad: whatever went wrong, try not to lose the batch.
            print(f"Error saving to CSV: {e}")
            fallback_path = os.path.join(os.getcwd(), self.filename)
            print(f"Attempting to save to fallback location: {fallback_path}")
            self._append_csv(df, fallback_path)

def main():
    """Run the full collection: build the seed list and crawl two levels deep.

    The seeds are the AI / data-science domain titles followed by the model
    and technique names, in that order. Results are written by the collector;
    nothing is returned.
    """
    # Broad AI & data-science domains (Wikipedia-style titles with underscores).
    domain_seeds = [
        'Artificial_intelligence', 'Machine_learning',
        'Deep_learning', 'Natural_language_processing',
        'Computer_vision', 'Neural_network',
        'Large_language_model', 'Reinforcement_learning',
        'Artificial_general_intelligence', 'Computational_linguistics',
        'Robotics', 'Computer_science',
        'Data_science', 'Automation',
        'Technology', 'Innovation',
        'Big_data', 'Data_mining',
        'Supervised_learning', 'Unsupervised_learning',
        'Causal_inference', 'AutoML',
        'Generative_AI', 'Model_interpretability',
        'Data_visualization',
    ]

    # Individual models and techniques, grouped by area.
    vision_models = [
        "ResNet",
        "YOLO (You Only Look Once)",
        "EfficientNet",
        "Mask R-CNN",
        "Inception",
        "VGGNet",
        "U-Net",
        "Swin Transformer",
        "DeepLabV3",
        "MobileNet",
    ]
    nlp_models = [
        "BERT (Bidirectional Encoder Representations from Transformers)",
        "GPT-3 / GPT-4 (Generative Pre-trained Transformers)",
        "T5 (Text-to-Text Transfer Transformer)",
        "RoBERTa",
        "XLNet",
        "DistilBERT",
        "ALBERT",
        "BART (Bidirectional and Auto-Regressive Transformers)",
        "OpenAI Codex",
        "LaMDA (Language Model for Dialogue Applications)",
    ]
    rl_models = [
        "Deep Q-Network (DQN)",
        "AlphaGo",
        "AlphaZero",
        "A3C (Asynchronous Advantage Actor-Critic)",
        "PPO (Proximal Policy Optimization)",
        "TD3 (Twin Delayed Deep Deterministic Policy Gradient)",
        "SAC (Soft Actor-Critic)",
        "DeepMind's MuZero",
    ]
    classic_ml_models = [
        "Random Forest",
        "Gradient Boosting Machines (GBM)",
        "Support Vector Machine (SVM)",
        "Logistic Regression",
        "K-Nearest Neighbors (KNN)",
        "Naive Bayes",
        "K-Means",
        "AdaBoost",
        "Linear/Polynomial Regression",
    ]
    generative_models = [
        "GANs (Generative Adversarial Networks)",
        "BigGAN",
        "VAE (Variational Autoencoders)",
        "StyleGAN",
        "DALL·E",
        "CLIP",
        "Stable Diffusion",
    ]
    speech_models = [
        "OpenAI Whisper",
        "DeepSpeech",
        "Tacotron 2",
        "Wav2Vec",
        "Vocoder",
    ]
    graph_models = [
        "GCN (Graph Convolutional Networks)",
        "GAT (Graph Attention Networks)",
        "GraphSAGE",
    ]

    # Flatten in the same order the groups are listed above.
    seeds = [
        *domain_seeds,
        *vision_models,
        *nlp_models,
        *rl_models,
        *classic_ml_models,
        *generative_models,
        *speech_models,
        *graph_models,
    ]

    crawler = WikiTopicsCollector("/content/drive/MyDrive/AITrendAnalysis-project/")

    print("Starting topic discovery and data collection...")
    _topics, _metadata = crawler.discover_related_topics(seeds, max_depth=2)
    print("Process complete!")


if __name__ == "__main__":
    main()

Streaming output truncated to the last 5000 lines.
Successfully saved 3507 rows to /content/drive/MyDrive/AITrendAnalysis-project/wikipedia_data_with_categories.csv
Saved batch of 3507 records
Processing topic: Fooocus
Successfully saved 228 rows to /content/drive/MyDrive/AITrendAnalysis-project/wikipedia_data_with_categories.csv
Saved batch of 228 records
Processing topic: Computer (magazine)
Successfully saved 3524 rows to /content/drive/MyDrive/AITrendAnalysis-project/wikipedia_data_with_categories.csv
Saved batch of 3524 records
Processing topic: Fiber crop
Successfully saved 3524 rows to /content/drive/MyDrive/AITrendAnalysis-project/wikipedia_data_with_categories.csv
Saved batch of 3524 records
Processing topic: Ethnomethodology
Successfully saved 3524 rows to /content/drive/MyDrive/AITrendAnalysis-project/wikipedia_data_with_categories.csv
Saved batch of 3524 records
Processing topic: Electric power industry
Successfully saved 3524 rows to /content/drive/MyDrive/AI