# Data Analysis for Wikipedia Rare Disease Articles

## 1. Import necessary libraries

In [1]:
import json
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

## 2. Load JSON data

In [2]:
def load_json_data(filename):
    """Parse the JSON document stored at ``filename`` and return it.

    The file is decoded explicitly as UTF-8 rather than with the platform's
    default encoding, so non-ASCII article titles (for example names
    containing an en dash) load identically on every operating system.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The deserialized JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

# Load the three monthly page-view exports in one pass: mobile, desktop and
# cumulative. The file names suggest coverage from 2015-07 through 2024-09.
mobile_data, desktop_data, cumulative_data = (
    load_json_data(json_path)
    for json_path in (
        'Json_data/rare-disease_monthly_mobile_201507-202409.json',
        'Json_data/rare-disease_monthly_desktop_201507-202409.json',
        'Json_data/rare-disease_monthly_cumulative_201507-202409.json',
    )
)

## 3. Data processing functions

In [3]:
def calculate_average_views(data):
    """Compute the mean of the recorded view counts for each article.

    Args:
        data: Mapping of article title -> mapping of month -> view count.

    Returns:
        A dict with the same titles (in the same order) mapped to the
        arithmetic mean of that article's view counts, or ``0`` when the
        article has no recorded months.
    """
    averages = {}
    for title, monthly in data.items():
        if monthly:
            averages[title] = sum(monthly.values()) / len(monthly)
        else:
            # No months recorded: report zero instead of dividing by zero.
            averages[title] = 0
    return averages

def find_peak_views(data):
    """Find the largest single recorded view count for each article.

    Args:
        data: Mapping of article title -> mapping of month -> view count.

    Returns:
        A dict with the same titles (in the same order) mapped to the
        maximum of that article's view counts, or ``0`` when the article has
        no recorded months.
    """
    peaks = {}
    for title, monthly in data.items():
        # max() would raise on an empty collection, so fall back to zero.
        peaks[title] = max(monthly.values()) if monthly else 0
    return peaks

def count_months_of_data(data):
    """Count how many months of data each article has.

    Args:
        data: Mapping of article title -> mapping of month -> view count.

    Returns:
        A dict with the same titles (in the same order) mapped to the number
        of entries in that article's month mapping.
    """
    month_counts = {}
    for title, monthly in data.items():
        month_counts[title] = len(monthly)
    return month_counts

## 4. Visualization functions

In [4]:
def plot_max_min_avg(mobile_data, desktop_data):
    """Plot the monthly series of the highest- and lowest-average articles.

    For each platform, the article with the largest and the article with the
    smallest mean view count (as computed by ``calculate_average_views``) are
    drawn as line charts: mobile on the upper axes, desktop on the lower
    axes. The figure is saved to
    ``visualization_images/max_min_avg_pageviews.png`` and then closed.

    Months are plotted in sorted key order, the same ordering
    ``analyze_trends`` uses, instead of relying on the insertion order of the
    loaded JSON. The output directory is created when it does not exist so
    saving does not fail on a fresh checkout.

    Args:
        mobile_data: Mapping of article title -> mapping of month -> views
            for mobile access.
        desktop_data: The same structure for desktop access.

    Raises:
        ValueError: If either mapping is empty, because there is no article
            to select as maximum or minimum.
    """
    # Select the extreme articles first so an empty input fails before any
    # figure is created.
    panels = []
    for platform, data in (('Mobile', mobile_data), ('Desktop', desktop_data)):
        averages = calculate_average_views(data)
        extremes = (
            max(averages, key=averages.get),
            min(averages, key=averages.get),
        )
        panels.append((platform, data, extremes))

    os.makedirs('visualization_images', exist_ok=True)
    _, axes = plt.subplots(2, 1, figsize=(15, 12))

    for ax, (platform, data, extremes) in zip(axes, panels):
        for article in extremes:
            series = data[article]
            if not series:
                # An article without any months has nothing to draw.
                continue
            months = sorted(series)
            ax.plot(
                months,
                [series[month] for month in months],
                label=f'{article} ({platform})',
            )

        ax.set_xlabel('Date')
        ax.set_ylabel('Page Views')
        ax.set_title(f'Maximum and Minimum Average Page Views ({platform})')
        ax.legend()
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.savefig('visualization_images/max_min_avg_pageviews.png')
    plt.close()

In [5]:
def plot_top_10_peak(mobile_data, desktop_data):
    """Draw bar charts of the ten articles with the highest peak views.

    Peak views per article come from ``find_peak_views``. Mobile results are
    drawn on the upper axes and desktop results on the lower axes, each
    sorted from highest to lowest peak. The figure is saved to
    ``visualization_images/top_10_peak_pageviews.png`` and then closed.

    The output directory is created when it does not exist so saving does
    not fail on a fresh checkout.

    Args:
        mobile_data: Mapping of article title -> mapping of month -> views
            for mobile access.
        desktop_data: The same structure for desktop access.
    """
    rankings = []
    for platform, data in (('Mobile', mobile_data), ('Desktop', desktop_data)):
        peaks = find_peak_views(data)
        # sorted() is stable, so articles with equal peaks keep input order.
        top_10 = sorted(peaks.items(), key=lambda item: item[1], reverse=True)[:10]
        rankings.append((platform, top_10))

    os.makedirs('visualization_images', exist_ok=True)
    _, axes = plt.subplots(2, 1, figsize=(15, 12))

    for ax, (platform, top_10) in zip(axes, rankings):
        sns.barplot(
            x=[views for _, views in top_10],
            y=[article for article, _ in top_10],
            ax=ax,
        )
        ax.set_title(f'Top 10 Peak Page Views ({platform})')
        ax.set_xlabel('Peak Views')
        ax.set_ylabel('Article')

    plt.tight_layout()
    plt.savefig('visualization_images/top_10_peak_pageviews.png')
    plt.close()

In [6]:
def plot_fewest_months(mobile_data, desktop_data):
    """Draw bar charts of the ten articles with the fewest months of data.

    Month counts per article come from ``count_months_of_data``. Mobile
    results are drawn on the upper axes and desktop results on the lower
    axes, each sorted from fewest to most months. The figure is saved to
    ``visualization_images/fewest_months_data.png`` and then closed.

    The output directory is created when it does not exist so saving does
    not fail on a fresh checkout.

    Args:
        mobile_data: Mapping of article title -> mapping of month -> views
            for mobile access.
        desktop_data: The same structure for desktop access.
    """
    rankings = []
    for platform, data in (('Mobile', mobile_data), ('Desktop', desktop_data)):
        month_counts = count_months_of_data(data)
        # sorted() is stable, so articles with equal counts keep input order.
        fewest_10 = sorted(month_counts.items(), key=lambda item: item[1])[:10]
        rankings.append((platform, fewest_10))

    os.makedirs('visualization_images', exist_ok=True)
    _, axes = plt.subplots(2, 1, figsize=(15, 12))

    for ax, (platform, fewest_10) in zip(axes, rankings):
        sns.barplot(
            y=[article for article, _ in fewest_10],
            x=[months for _, months in fewest_10],
            ax=ax,
        )
        ax.set_title(f'10 Articles with Fewest Months of Data ({platform})')
        ax.set_xlabel('Number of Months')
        ax.set_ylabel('Article')

    plt.tight_layout()
    plt.savefig('visualization_images/fewest_months_data.png')
    plt.close()

## 5. Additional analysis functions

In [7]:
def analyze_trends(data):
    """Tally month-over-month movement of view counts for each article.

    Each article's (month, views) pairs are sorted, and every consecutive
    pair is classified as an increase, a decrease, or no change.

    Args:
        data: Mapping of article title -> mapping of month -> view count.

    Returns:
        A ``defaultdict`` mapping article title to a dict with the counters
        ``'increasing'``, ``'decreasing'`` and ``'stable'``. Articles with
        fewer than two months never get an entry; looking up any missing
        title yields (and inserts) an all-zero tally.
    """
    def _zero_tally():
        return {'increasing': 0, 'decreasing': 0, 'stable': 0}

    trends = defaultdict(_zero_tally)
    for title, monthly in data.items():
        ordered = sorted(monthly.items())
        # Pair each month with its successor to compare neighbouring counts.
        for (_, before), (_, after) in zip(ordered, ordered[1:]):
            if after > before:
                direction = 'increasing'
            elif after < before:
                direction = 'decreasing'
            else:
                direction = 'stable'
            trends[title][direction] += 1
    return trends

In [8]:
def plot_overall_trend(data, title):
    """Plot total views per month summed over all articles and save it.

    The month axis is the sorted union of the months present in any article.
    Taking the months from a single article would silently drop every month
    that article happens to lack, even though other articles have views in
    it. An article contributes ``0`` for a month it has no entry for. With
    empty ``data`` an empty chart is produced.

    The figure is saved to
    ``visualization_images/overall_trend_<title lower-cased>.png`` and then
    closed. The output directory is created when it does not exist.

    Args:
        data: Mapping of article title -> mapping of month -> view count.
        title: Label used in the chart title and, lower-cased, in the output
            file name.
    """
    # Iterating a month mapping yields its keys, so this collects every
    # month seen in any article.
    months = sorted(set().union(*data.values()))
    total_views = [
        sum(monthly.get(month, 0) for monthly in data.values())
        for month in months
    ]

    os.makedirs('visualization_images', exist_ok=True)
    plt.figure(figsize=(15, 6))
    plt.plot(months, total_views)
    plt.title(f'Overall Trend of {title} Views')
    plt.xlabel('Month')
    plt.ylabel('Total Views')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f'visualization_images/overall_trend_{title.lower()}.png')
    plt.close()

## 6. Perform analysis and create visualizations

In [9]:
# Render and save the three mobile-versus-desktop comparison figures.
plot_max_min_avg(mobile_data, desktop_data)
plot_top_10_peak(mobile_data, desktop_data)
plot_fewest_months(mobile_data, desktop_data)

# Month-over-month movement tallies, mobile first and then desktop.
mobile_trends, desktop_trends = (
    analyze_trends(platform_data)
    for platform_data in (mobile_data, desktop_data)
)

# One aggregate trend chart per platform.
plot_overall_trend(mobile_data, 'Mobile')
plot_overall_trend(desktop_data, 'Desktop')

print("Analysis complete. Visualizations have been saved in the output directory.")



Analysis complete. Visualizations have been saved in the output directory.


## 7. Print some summary statistics

In [10]:
# Headline figures: article count and mean months of coverage per platform.
print("\nSummary Statistics:")
print(f"Total number of articles: {len(mobile_data)}")
mobile_month_total = sum(count_months_of_data(mobile_data).values())
print(f"Average number of months of data (Mobile): {mobile_month_total / len(mobile_data):.2f}")
desktop_month_total = sum(count_months_of_data(desktop_data).values())
print(f"Average number of months of data (Desktop): {desktop_month_total / len(desktop_data):.2f}")

# Five highest-average articles on each platform, best first.
print("\nTop 5 articles by average views (Mobile):")
mobile_ranking = sorted(calculate_average_views(mobile_data).items(), key=lambda pair: pair[1], reverse=True)
for article, views in mobile_ranking[:5]:
    print(f"{article}: {views:.2f}")

print("\nTop 5 articles by average views (Desktop):")
desktop_ranking = sorted(calculate_average_views(desktop_data).items(), key=lambda pair: pair[1], reverse=True)
for article, views in desktop_ranking[:5]:
    print(f"{article}: {views:.2f}")

# Articles whose views rose in more than half of the month-to-month steps.
print("\nArticles with increasing trend (more than 50% of the time):")
for article, trend in mobile_trends.items():
    total = sum(trend.values())
    if total <= 0:
        continue
    rising_share = trend['increasing'] / total
    if rising_share > 0.5:
        print(f"{article} (Mobile): {rising_share:.2%}")
for article, trend in desktop_trends.items():
    total = sum(trend.values())
    if total <= 0:
        continue
    rising_share = trend['increasing'] / total
    if rising_share > 0.5:
        print(f"{article} (Desktop): {rising_share:.2%}")


Summary Statistics:
Total number of articles: 1774
Average number of months of data (Mobile): 107.79
Average number of months of data (Desktop): 107.79

Top 5 articles by average views (Mobile):
Black Death: 177697.85
Tuberculosis: 155546.10
Multiple sclerosis: 122010.30
Amyotrophic lateral sclerosis: 101279.24
Leprosy: 94630.02

Top 5 articles by average views (Desktop):
Black Death: 104859.32
Tuberculosis: 71768.62
Multiple sclerosis: 57457.68
Smallpox: 55645.26
Dopamine: 48815.49

Articles with increasing trend (more than 50% of the time):
Acanthocheilonemiasis (Mobile): 50.91%
Osteochondrodysplasia (Mobile): 51.82%
Fibroblast growth factor receptor 3 (Mobile): 51.82%
Barraquer–Simons syndrome (Mobile): 51.82%
Acrodermatitis enteropathica (Mobile): 50.91%
Spinal cord injury (Mobile): 51.82%
Budd–Chiari syndrome (Mobile): 52.73%
Thrombosis (Mobile): 52.73%
Breastfeeding difficulties (Mobile): 53.64%
Cerebral autosomal recessive arteriopathy with subcortical infarcts and leukoencepha