In [1]:
import requests
import json
import sqlite3
from datetime import datetime

In [2]:
def setup_database(db_path="live_wikipedia.db"):
    """Open the SQLite database and make sure the ``wiki_edits`` table exists.

    Args:
        db_path: Path handed to ``sqlite3.connect``. Defaults to
            ``"live_wikipedia.db"`` (the previously hard-coded value), so
            existing zero-argument callers behave exactly as before. Pass
            ``":memory:"`` for a throwaway database.

    Returns:
        An open ``sqlite3.Connection``. The caller owns it and is responsible
        for closing it.

    Raises:
        sqlite3.Error: If the database cannot be opened or the table cannot be
            created. The connection is closed before the error propagates.
    """
    conn = sqlite3.connect(db_path)
    try:
        # IF NOT EXISTS makes this safe to call on every run.
        conn.execute('''
            CREATE TABLE IF NOT EXISTS wiki_edits (
                timestamp TEXT,
                user_name TEXT,
                page_title TEXT,
                is_bot BOOLEAN,
                edit_length_change INTEGER,
                wikipedia_domain TEXT
            )
        ''')
        conn.commit()
    except Exception:
        # Do not hand back (or leak) a half-initialised connection.
        conn.close()
        raise
    return conn

In [3]:
def _edit_row_from_event(event):
    """Extract the stored fields from one decoded stream event.

    Args:
        event: The object produced by ``json.loads`` for a ``data: `` line.

    Returns:
        ``(user, title, is_bot, length_diff, domain)`` when the event is an
        ``edit`` in namespace 0, otherwise ``None`` (event should be skipped).
    """
    if not isinstance(event, dict):
        return None
    if event.get('type') != 'edit' or event.get('namespace') != 0:
        return None

    user = event.get('user', 'Unknown')
    title = event.get('title', 'Unknown')
    is_bot = event.get('bot', False)

    # `.get(key, default)` only covers a *missing* key; a key that is present
    # but null would otherwise raise on `.get` / subtraction and stop the
    # whole pipeline, so nulls are normalised here.
    domain = (event.get('meta') or {}).get('domain', 'Unknown')
    length = event.get('length') or {}
    old_len = length.get('old') or 0
    new_len = length.get('new') or 0

    return user, title, is_bot, new_len - old_len, domain


def stream_wikipedia_data():
    """Stream Wikimedia recent-change events and persist main-namespace edits.

    Opens the database via ``setup_database()``, issues a streaming GET to the
    ``recentchange`` endpoint, and for every ``data: `` line that decodes to an
    ``edit`` event in namespace 0 inserts one row into ``wiki_edits`` and prints
    a one-line summary. Runs until the stream ends, the server answers with a
    non-200 status, the user interrupts, or an error occurs.

    Returns:
        None. ``KeyboardInterrupt`` and other exceptions raised while streaming
        are reported on stdout rather than propagated. The database connection
        is closed on every exit path.
    """
    print("üîå Connecting to Wikipedia's Live Data Firehose...", flush=True)

    url = 'https://stream.wikimedia.org/v2/stream/recentchange'

    headers = {
        'User-Agent': 'DataEngineeringPoC/1.0 (Python/requests)'
    }

    conn = setup_database()

    try:
        cursor = conn.cursor()

        # NOTE(review): this banner is printed before the HTTP request below is
        # issued; the order is kept so the console output stays unchanged.
        print("üì° Connected! Listening for live global edits...\n", flush=True)
        print("-" * 70, flush=True)

        with requests.get(url, headers=headers, stream=True, timeout=None) as response:

            print(f"HTTP Status Code: {response.status_code} (Should be 200)", flush=True)

            if response.status_code != 200:
                print("‚ùå Connection refused by Wikipedia. Stopping.")
                return

            for line in response.iter_lines(chunk_size=1):
                if not line:
                    continue

                decoded_line = line.decode('utf-8')
                if not decoded_line.startswith('data: '):
                    continue

                try:
                    # Strip the 6-character 'data: ' prefix before parsing.
                    json_data = json.loads(decoded_line[6:])
                except json.JSONDecodeError:
                    # Skip payloads that are not valid JSON.
                    continue

                row = _edit_row_from_event(json_data)
                if row is None:
                    continue
                user, title, is_bot, length_diff, domain = row

                current_time = datetime.now().strftime("%H:%M:%S")

                cursor.execute('''
                    INSERT INTO wiki_edits (timestamp, user_name, page_title, is_bot, edit_length_change, wikipedia_domain)
                    VALUES (?, ?, ?, ?, ?, ?)
                ''', (current_time, user, title, is_bot, length_diff, domain))
                # Commit per row so an interrupt loses nothing already printed.
                conn.commit()

                bot_status = "ü§ñ BOT" if is_bot else "üë§ HUMAN"
                print(f"[{current_time}] {bot_status} edited: '{title}' ({length_diff} chars) on {domain}", flush=True)

    except KeyboardInterrupt:
        print("\nüõë Pipeline stopped by user. Closing database connection.", flush=True)
    except Exception as e:
        print(f"\n‚ùå Pipeline crashed: {e}", flush=True)
    finally:
        # Previously the connection was only closed in the except branches and
        # leaked on the non-200 early return and on normal end of stream.
        conn.close()

        
# Start the stream only when this code runs as the main program (which
# includes running the cell in a notebook kernel), not when it is imported.
if __name__ == "__main__":
    stream_wikipedia_data()

üîå Connecting to Wikipedia's Live Data Firehose...
üì° Connected! Listening for live global edits...

----------------------------------------------------------------------
HTTP Status Code: 200 (Should be 200)
[17:46:31] üë§ HUMAN edited: 'Jerzy Duracz' (6 chars) on pl.wikipedia.org
[17:46:31] üë§ HUMAN edited: 'Q4129770' (71 chars) on www.wikidata.org
[17:46:31] üë§ HUMAN edited: 'Q137326281' (1071 chars) on www.wikidata.org
[17:46:31] ü§ñ BOT edited: 'Q124201674' (-2 chars) on www.wikidata.org
[17:46:31] üë§ HUMAN edited: 'Simone Consonni' (-16 chars) on de.wikipedia.org
[17:46:31] üë§ HUMAN edited: 'Q4129770' (13 chars) on www.wikidata.org
[17:46:32] ü§ñ BOT edited: 'Richard Thomas' (11 chars) on pt.wikipedia.org
[17:46:32] üë§ HUMAN edited: 'Q4129770' (20 chars) on www.wikidata.org
[17:46:32] üë§ HUMAN edited: 'Q137132465' (1071 chars) on www.wikidata.org
[17:46:32] üë§ HUMAN edited: 'List fan Fryske bierbrouwerijen' (0 chars) on fy.wikipedia.org
[17:46:32] üë§ HUMAN

In [4]:
import sqlite3
import pandas as pd

def analyze_live_data(db_path="live_wikipedia.db"):
    """Print summary statistics for the edits stored in ``wiki_edits``.

    Reports the total number of rows, the edit count per ``is_bot`` value, and
    the five page titles with the largest summed absolute length change.

    Args:
        db_path: Path of the SQLite database to read. Defaults to
            ``"live_wikipedia.db"`` (the previously hard-coded value), so
            existing zero-argument callers behave exactly as before.

    Returns:
        None. All results are written to stdout.

    Raises:
        Whatever ``pd.read_sql_query`` raises when a query fails (for example
        if the ``wiki_edits`` table does not exist). The connection is closed
        before the error propagates.
    """
    print("üìä Analyzing Wikipedia Stream Data...\n")

    conn = sqlite3.connect(db_path)
    try:
        total_edits = pd.read_sql_query("SELECT COUNT(*) as Total_Edits FROM wiki_edits", conn)
        # .iloc[0] is positional and avoids chained `df[col][0]` indexing.
        print(f"Total Edits Captured: {total_edits['Total_Edits'].iloc[0]}")
        print("-" * 40)

        bot_ratio = pd.read_sql_query('''
            SELECT 
                CASE WHEN is_bot = 1 THEN 'Bots ü§ñ' ELSE 'Humans üë§' END as User_Type,
                COUNT(*) as Edit_Count
            FROM wiki_edits
            GROUP BY is_bot
        ''', conn)
        print("Who is making the edits?")
        print(bot_ratio.to_string(index=False))
        print("-" * 40)

        # NOTE(review): rows are grouped by page_title alone, so identical
        # titles with different wikipedia_domain values are summed together;
        # confirm that is the intended ranking.
        top_pages = pd.read_sql_query('''
            SELECT page_title, SUM(ABS(edit_length_change)) as Total_Chars_Changed
            FROM wiki_edits
            GROUP BY page_title
            ORDER BY Total_Chars_Changed DESC
            LIMIT 5
        ''', conn)

        print("Top 5 Most Heavily Modified Pages:")
        print(top_pages.to_string(index=False))
    finally:
        # Release the database handle even when one of the queries raises.
        conn.close()

# Run the analysis on the rows currently stored in the SQLite database.
analyze_live_data()

üìä Analyzing Wikipedia Stream Data...

Total Edits Captured: 3870
----------------------------------------
Who is making the edits?
User_Type  Edit_Count
 Humans üë§        3176
   Bots ü§ñ         694
----------------------------------------
Top 5 Most Heavily Modified Pages:
                               page_title  Total_Chars_Changed
                                  America                74784
                     ‡¶ï‡ßá‡¶≤‡¶æ‡¶∏‡¶¨‡¶ø‡¶ú‡ßç‡¶û‡¶æ‡¶® ‡¶°‡ßá‡¶ü‡¶æ‡¶¨‡ßá‡¶∏                59878
                                   ÿ≠ÿ∂ÿ±ŸÖŸàÿ™                53943
                               Q138481572                34885
2022 Punjab Legislative Assembly election                23227
