In [2]:
import requests
import pandas as pd
import time

def get_simple_revision_data(article_title, max_revs=100):
    """
    Simple function to get Wikipedia revision history for an article.
    """
    # Format article title for API
    title = article_title.replace(' ', '_')
    
    # API endpoint
    api_url = "https://en.wikipedia.org/w/api.php"
    
    # Basic parameters
    params = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        "titles": title,
        "rvprop": "ids|timestamp|user|comment|size",
        "rvlimit": max_revs
    }
    
    # Make request
    response = requests.get(api_url, params=params)
    data = response.json()
    
    # Extract page ID
    page_id = list(data['query']['pages'].keys())[0]
    
    # Get revisions
    revisions = []
    if 'revisions' in data['query']['pages'][page_id]:
        revisions = data['query']['pages'][page_id]['revisions']
    
    # Convert to DataFrame
    rev_data = []
    for i, rev in enumerate(revisions):
        # Calculate size difference if possible
        diff_size = None
        if i < len(revisions) - 1:
            diff_size = rev.get('size', 0) - revisions[i+1].get('size', 0)
        
        rev_data.append({
            'revid': rev['revid'],
            'user': rev.get('user', 'Anonymous'),
            'timestamp': rev.get('timestamp'),
            'comment': rev.get('comment', ''),
            'size': rev.get('size', 0),
            'diff_size': diff_size,
            'article': article_title,
            # Simple vandalism heuristic
            'possible_vandalism': 'revert' in rev.get('comment', '').lower() or 
                                  'vandal' in rev.get('comment', '').lower() or
                                  abs(diff_size if diff_size else 0) > 1000
        })
    
    return pd.DataFrame(rev_data)

def get_all_articles_data(articles_list, revs_per_article=100):
    """
    Get revision data for multiple articles and combine them.
    """
    all_data = []
    
    for article in articles_list:
        print(f"Getting revision data for '{article}'...")
        try:
            article_df = get_simple_revision_data(article, max_revs=revs_per_article)
            all_data.append(article_df)
            # Be nice to Wikipedia's servers
            time.sleep(1)
        except Exception as e:
            print(f"Error processing '{article}': {str(e)}")
    
    # Combine all data
    if all_data:
        combined_df = pd.concat(all_data, ignore_index=True)
        return combined_df
    else:
        return pd.DataFrame()

# Example usage
if __name__ == "__main__":
    # List of articles known for vandalism issues
    vandalism_prone_articles = [
        # Controversial political figures
        "Donald Trump",
        "Joe Biden",
        "Hillary Clinton",
        "Barack Obama",
        "Vladimir Putin",
        
        # Controversial topics
        "Climate change",
        "COVID-19",
        "Abortion",
        "Gun control",
        "Israel-Palestine conflict",
        
        # Popular culture
        "Taylor Swift",
        "BTS (band)",
        "Game of Thrones",
        "Star Wars",
        
        # Sports teams with rivalries
        "Manchester United F.C.",
        "Real Madrid CF",
        
        # Technology/finance
        "Bitcoin",
        "Elon Musk",
        "Facebook",
        "Apple Inc.",
        
        # Historical events
        "September 11 attacks",
        "Holocaust",
        
        # Scientific topics
        "Evolution",
        "Vaccination"
    ]
    
    # Get a smaller subset if you want faster results
    # articles_to_process = vandalism_prone_articles[:5]  # Uncomment to use just 5 articles
    articles_to_process = vandalism_prone_articles
    
    # Get the data
    print(f"Collecting data for {len(articles_to_process)} articles...")
    combined_df = get_all_articles_data(articles_to_process, revs_per_article=50)
    
    # Save to CSV
    combined_df.to_csv('wikipedia_vandalism_dataset.csv', index=False)
    
    # Show basic stats
    print("\nDataset statistics:")
    print(f"Total revisions collected: {len(combined_df)}")
    print(f"Potential vandalism cases: {combined_df['possible_vandalism'].sum()}")
    print(f"Overall vandalism percentage: {combined_df['possible_vandalism'].mean() * 100:.2f}%")
    
    # Show per-article statistics
    print("\nVandalism rates by article:")
    article_stats = combined_df.groupby('article')['possible_vandalism'].agg(['count', 'sum', 'mean'])
    article_stats = article_stats.sort_values('mean', ascending=False)
    article_stats['percentage'] = article_stats['mean'] * 100
    print(article_stats.to_string(float_format="{:.2f}".format))

Collecting data for 24 articles...
Getting revision data for 'Donald Trump'...
Getting revision data for 'Joe Biden'...
Getting revision data for 'Hillary Clinton'...
Getting revision data for 'Barack Obama'...
Getting revision data for 'Vladimir Putin'...
Getting revision data for 'Climate change'...
Getting revision data for 'COVID-19'...
Getting revision data for 'Abortion'...
Getting revision data for 'Gun control'...
Getting revision data for 'Israel-Palestine conflict'...
Getting revision data for 'Taylor Swift'...
Getting revision data for 'BTS (band)'...
Getting revision data for 'Game of Thrones'...
Getting revision data for 'Star Wars'...
Getting revision data for 'Manchester United F.C.'...
Getting revision data for 'Real Madrid CF'...
Getting revision data for 'Bitcoin'...
Getting revision data for 'Elon Musk'...
Getting revision data for 'Facebook'...
Getting revision data for 'Apple Inc.'...
Getting revision data for 'September 11 attacks'...
Getting revision data for 'Ho

In [4]:
df = pd.read_csv("wikipedia_vandalism_dataset.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067 entries, 0 to 1066
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   revid               1067 non-null   int64  
 1   user                1067 non-null   object 
 2   timestamp           1067 non-null   object 
 3   comment             921 non-null    object 
 4   size                1067 non-null   int64  
 5   diff_size           1043 non-null   float64
 6   article             1067 non-null   object 
 7   possible_vandalism  1067 non-null   bool   
dtypes: bool(1), float64(1), int64(2), object(4)
memory usage: 59.5+ KB
