# Group Project 1

## Download data

In [None]:
import os
import subprocess
import sys

import pandas as pd

# Articles whose revision histories are downloaded
article1 = "Liz Truss"
# Lower-case "lettuce" so the generated file matches the
# "Liz Truss lettuce.feather" name loaded by the analysis cells (the output is
# presumably named after the article title, as the read of
# DataFrames/{article1}.feather below suggests).
# NOTE(review): confirm download_wiki_revisions.py resolves this exact title.
article2 = "Liz Truss lettuce"

# Create necessary directories if they don't exist
os.makedirs("data", exist_ok=True)
os.makedirs("DataFrames", exist_ok=True)

# Download revisions for both articles.
# check=True raises CalledProcessError when a script exits with a non-zero
# status, so a failed download stops the cell instead of being silently
# ignored. Arguments are passed as a list (no shell quoting needed) and
# sys.executable runs the scripts with the same interpreter as the kernel.
print("Downloading revisions for first article...")
subprocess.run([sys.executable, "download_wiki_revisions.py", article1], check=True)
print("\nDownloading revisions for second article...")
subprocess.run([sys.executable, "download_wiki_revisions.py", article2], check=True)

# Convert all downloaded revisions to DataFrames
# Optional flags noted for xml_to_dataframe.py:
#   --batch-size <int>   batch size
#   --include-text       include full text
print("\nConverting revisions to DataFrames...")
subprocess.run(
    [sys.executable, "xml_to_dataframe.py", "--data-dir", "./data", "--output-dir", "./DataFrames"],
    check=True,
)

# Load and verify one of the DataFrames
print("\nVerifying DataFrame contents...")
df = pd.read_feather(f"DataFrames/{article1}.feather")

# Display basic information about the DataFrame
print("\nDataFrame Info:")
df.info()  # info() prints its report itself and returns None, so no print() wrapper

print("\nFirst few rows:")
print(df.head())

# Display some basic statistics
print("\nBasic statistics:")
print(f"Total number of revisions: {len(df)}")
print(f"Date range: from {df['timestamp'].min()} to {df['timestamp'].max()}")
print(f"Number of unique editors: {df['username'].nunique()}")

In [None]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns 
import matplotlib.dates as mdates
from scipy.stats import spearmanr
from scipy.stats import pearsonr
import numpy as np
import matplotlib.colors as mcolors

In [None]:
# Info
# Liz Truss in office (6.9.2022-25.10.2022)
# Wikipedia page created for meme (20.10.2022)


In [None]:
# Input datasets: one revision table per Wikipedia page, read from ./DataFrames
dataframes_dir = Path.cwd() / "DataFrames"
liz_truss_df = pd.read_feather(dataframes_dir / "Liz Truss.feather")
lettuce_df = pd.read_feather(dataframes_dir / "Liz Truss lettuce.feather")

In [None]:
# No row is missing a timestamp or revision_id, so no rows are dropped

## Data Merging

In [None]:
# Revision counts per time bucket (month, day, week, hour, 30 minutes) for both pages

def _revisions_per_period(revisions, rule):
    """Count revisions in every `rule`-sized time bucket.

    Resamples on the "timestamp" column and returns a frame with a
    "timestamp" column (one row per bucket) and a "revision_id" column
    holding the number of revisions that fall into that bucket.
    """
    return (
        revisions[["revision_id", "timestamp"]]
        .resample(rule, on="timestamp")
        .count()
        .reset_index()
    )


lettuce_df_month = _revisions_per_period(lettuce_df, "M")
lettuce_df_day = _revisions_per_period(lettuce_df, "D")
liz_truss_df_month = _revisions_per_period(liz_truss_df, "M")
liz_truss_df_day = _revisions_per_period(liz_truss_df, "D")
lettuce_df_week = _revisions_per_period(lettuce_df, "W")
liz_truss_df_week = _revisions_per_period(liz_truss_df, "W")
lettuce_df_hour = _revisions_per_period(lettuce_df, "h")
liz_truss_df_hour = _revisions_per_period(liz_truss_df, "h")
lettuce_df_min = _revisions_per_period(lettuce_df, "30min")
liz_truss_df_min = _revisions_per_period(liz_truss_df, "30min")

In [None]:
# Collections of the resampled frames, grouped by granularity and by page

# Grouped by granularity
months_list = [lettuce_df_month, liz_truss_df_month]
days_list = [lettuce_df_day, liz_truss_df_day]
weeks_list = [liz_truss_df_week, lettuce_df_week]

# Grouped by page (day, month, week)
liz_truss_list = [liz_truss_df_day, liz_truss_df_month, liz_truss_df_week]
lettuce_list = [lettuce_df_day, lettuce_df_month, lettuce_df_week]

# The "timestamp" columns are left exactly as resample produced them (no
# strftime formatting is applied): the plotting cells pass datetime axis limits
# and matplotlib date formatters to the time axis.

In [None]:
# Combine
# Tag every day/month/week frame with the page it belongs to
for page_label, page_frames in (("Liz Truss", liz_truss_list), ("Lettuce", lettuce_list)):
    for page_frame in page_frames:
        page_frame["source"] = page_label

# Long-format monthly table: the monthly counts of both pages stacked on top of
# each other, distinguished by the "source" column
months_df = pd.concat(months_list)


In [None]:
# Merge
# Weekly counts side by side: one row per week, one count column per page.
# Both inputs also carry a "source" column, which the merge keeps as
# source_x / source_y.
weeks_df = (
    liz_truss_df_week
    .merge(lettuce_df_week, on="timestamp", how="outer")
    .rename(columns={"revision_id_x": "revisions_liz", "revision_id_y": "revisions_lettuce"})
)

## Overview Plot

In [None]:
# Overview plot: monthly revision counts of both pages, coloured by page

fig, ax = plt.subplots()
sns.scatterplot(data=months_df, x="timestamp", y="revision_id", hue="source", ax=ax)
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m"))
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
plt.xticks(rotation=45)
ax.set_xlim(pd.to_datetime("2022-01-01"), pd.to_datetime("2024-12-31"))
# The "revision_id" column holds the number of revisions per month (result of
# resample(...).count()), so label the axis as a count rather than an id.
ax.set_ylabel("Count of revisions (month)")
ax.set_xlabel(None)
ax.set_title("Monthly revisions per Wikipedia page")
plt.show()

## Plot for presentation

In [None]:
# For presentation: weekly revision counts of both pages on a shared time axis,
# each page on its own y-axis

fig, ax = plt.subplots()
sns.scatterplot(data=weeks_df, x="timestamp", y="revisions_liz", ax=ax, alpha=0.7, color="#5d3eb3", label="Liz Truss")
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m"))
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
plt.xticks(rotation=45)
ax.set_xlim(pd.to_datetime("2022-01-01"), pd.to_datetime("2024-12-31"))
ax.set_ylabel("Count of Revisions to Liz Truss page (week)")
ax.set_xlabel(None)

# Second y-axis for the meme page, sharing the x-axis with the first one
ax2 = ax.twinx()
sns.scatterplot(data=weeks_df, x="timestamp", y="revisions_lettuce", ax=ax2, color="#32a852", alpha=0.7, label="Lettuce Meme")
ax2.set_ylabel("Count of Revisions to Lettuce Meme page (week)")

# Vertical lines
resignation = pd.to_datetime("2022-10-20")  # resignation announcement
office = pd.to_datetime("2022-09-06")  # start of the term in office, matching the 6.9.2022 date in the info cell
ax.axvline(resignation, color="#F08080", linestyle='--', linewidth=1.5, label="Resignation")
ax.axvline(office, color="lightblue", linestyle='--', linewidth=1.5, label="Appointment")

# Combine legends from both axes
handles, labels = ax.get_legend_handles_labels()  # Handles & labels from the first axis (points + vertical lines)
handles2, labels2 = ax2.get_legend_handles_labels()  # Handles & labels from the second axis
handles.extend(handles2)  # Combine handles
labels.extend(labels2)  # Combine labels

# Remove the separate legend on the second axis, if one was drawn
ax2_legend = ax2.get_legend()
if ax2_legend is not None:
    ax2_legend.remove()

# Create combined legend
ax.legend(handles, labels, title="Legend", loc="upper right")


plt.show()

## Lagged correlations

In [None]:
def correlation_with_lagged_x(y:pd.Series, x:pd.Series, max_lag:int) -> tuple:
    '''Spearman correlation between y and x, with x lagged by 0 up to max_lag periods.

    For every lag k in 0..max_lag the value y[t] is paired with x[t - k], i.e.
    with the value x had k periods earlier. The first k observations of y and
    the last k observations of x have no partner and are left out.

    Parameters:
        y: observations of the series that is kept at its original time.
        x: observations of the series that is lagged; same length and same
           (chronological) order as y.
        max_lag: largest lag to evaluate.

    Returns:
        (statistic, pvalue): two lists of length max_lag + 1; entry k holds the
        Spearman coefficient and its p-value for lag k.

    Raises:
        ValueError: if y and x differ in length.
    '''
    # Work on plain arrays so slicing is purely positional, whatever index the
    # Series carry after merging / dropping rows.
    y_values = np.asarray(y)
    x_values = np.asarray(x)
    if len(y_values) != len(x_values):
        raise ValueError("y and x must have the same length")

    n = len(y_values)
    statistic = []
    pvalue = []
    for lag in range(max_lag + 1):
        y_current = y_values[lag:]       # y at time t, for t = lag .. n-1
        x_lagged = x_values[:n - lag]    # x at time t - lag for the same t
        corr = spearmanr(y_current, x_lagged)
        statistic.append(corr.statistic)
        pvalue.append(corr.pvalue)
    return statistic, pvalue

def heatmap_for_correlation(correlation:list,pvalue:list,column_name:str,max_lag:int) -> pd.DataFrame:
    '''Arrange lagged correlation coefficients as a one-row table for sns.heatmap.

    Nothing is drawn here; the returned frame is meant to be passed to a
    plotting call.

    Parameters:
        correlation: max_lag + 1 coefficients, ordered by lag 0..max_lag.
        pvalue: p-values belonging to the coefficients. Accepted to keep the
            call signature stable, but not included in the returned table.
        column_name: prefix of the column labels, e.g. "Day" gives "Day 0", "Day 1", ...
        max_lag: largest lag contained in correlation.

    Returns:
        DataFrame with the single row "Spearman Correlation" and one column per lag.
    '''
    coefficients = np.asarray(correlation).reshape(1, -1)  # one row, one column per lag
    heatmap_df = pd.DataFrame(
        coefficients,
        index=["Spearman Correlation"],
        columns=[f'{column_name} {lag}' for lag in range(max_lag + 1)],
    )
    return heatmap_df
    

In [None]:
# Daily revision counts of both pages side by side.
# The outer merge leaves NaN wherever only one page has a row for a day;
# dropna() removes those rows, so only days covered by both series remain.
# Rows are then sorted chronologically, which the lag computation relies on.
day_df = (
    liz_truss_df_day
    .merge(lettuce_df_day, on="timestamp", how="outer")
    .rename(columns={"revision_id_x": "revisions_liz", "revision_id_y": "revisions_lettuce"})
    .dropna()
    .sort_values(by="timestamp", ascending=True)
)

max_lag=10

statistic,pvalue=correlation_with_lagged_x(day_df["revisions_lettuce"],day_df["revisions_liz"],max_lag)

heatmap_df=heatmap_for_correlation(statistic,pvalue,"Day",max_lag)


# Plotting Heatmap
plt.figure(figsize=(15, 5))
sns.heatmap(heatmap_df, annot=True, cmap='coolwarm',linewidths=0.5, linecolor='gray',fmt=".2f")
# The coefficients come from spearmanr, so the title names Spearman (not Pearson)
plt.title('Spearman Correlation between Counts of Meme Revisions and Lagged Counts of Liz Truss Revisions')
plt.show()

In [None]:
# Hourly revision counts of both pages side by side, restricted to the hours
# for which both series have a row (dropna after the outer merge), in
# chronological order
hour_df = (
    liz_truss_df_hour
    .merge(lettuce_df_hour, on="timestamp", how="outer")
    .rename(columns={"revision_id_x": "revisions_liz", "revision_id_y": "revisions_lettuce"})
    .dropna()
    .sort_values(by="timestamp", ascending=True)
)

# Meme revisions against Liz Truss revisions, for lags of 0 to 12 hours
max_lag = 12
statistic, pvalue = correlation_with_lagged_x(
    hour_df["revisions_lettuce"],
    hour_df["revisions_liz"],
    max_lag,
)
heatmap_df = heatmap_for_correlation(statistic, pvalue, "Hour", max_lag)

# Heatmap of the coefficients (one cell per lag, no colour bar)
plt.figure(figsize=(10, 4))
sns.heatmap(
    heatmap_df,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
    linecolor='gray',
    cbar=False,
)
plt.title('Spearman Correlation between Counts of Meme Revisions and Lagged Counts of Liz Truss Revisions')
plt.show()

In [None]:
# Hourly revision counts of both pages side by side, restricted to the hours
# for which both series have a row (dropna after the outer merge), in
# chronological order
hour_df = (
    liz_truss_df_hour
    .merge(lettuce_df_hour, on="timestamp", how="outer")
    .rename(columns={"revision_id_x": "revisions_liz", "revision_id_y": "revisions_lettuce"})
    .dropna()
    .sort_values(by="timestamp", ascending=True)
)

# Roles swapped compared to the previous cell: Liz Truss revisions against
# meme revisions, for lags of 0 to 12 hours
max_lag = 12
statistic, pvalue = correlation_with_lagged_x(
    hour_df["revisions_liz"],
    hour_df["revisions_lettuce"],
    max_lag,
)
heatmap_df = heatmap_for_correlation(statistic, pvalue, "Hour", max_lag)

# Heatmap of the coefficients (one cell per lag, no colour bar)
plt.figure(figsize=(10, 4))
sns.heatmap(
    heatmap_df,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
    linecolor='gray',
    cbar=False,
)
plt.title('Spearman Correlation between Counts of Liz Truss Revisions and Lagged Counts of Meme Revisions')
plt.show()

## Author Analysis

In [None]:
from scipy.stats import chi2_contingency

# Who edits which page?
# The lettuce meme page has a higher fraction of "recurring" authors than the Liz Truss page, while the Liz Truss page has a higher fraction of "once" authors than the meme page
# Interpretation: the meme is a niche Wikipedia page that few people know about, so it is edited by a small community of returning authors and only a few one-time authors
# Liz Truss is a well-known politician, so her page is edited by a large community of people, of whom only a small share edit it repeatedly


# Restrict liz_truss_df to the time in which the meme page existed.
# The cut-off is the earliest lettuce revision. min() is used because [0] on a
# sorted Series looks up the row *labelled* 0 and therefore ignores the sort.
meme_page_start = lettuce_df["timestamp"].min()

# Drop rows with missing usernames.
# New frames are built instead of dropping in place, so the raw inputs
# (liz_truss_df, lettuce_df) stay untouched and earlier cells give the same
# result when they are re-run.
liz_truss_df_recent = (
    liz_truss_df[liz_truss_df["timestamp"] >= meme_page_start]
    .dropna(subset=["username"])
)
lettuce_df_authors = lettuce_df.dropna(subset=["username"])

# Determine author groups
liz_truss_editor = set(liz_truss_df_recent["username"])
lettuce_editor = set(lettuce_df_authors["username"])
common_editor = liz_truss_editor & lettuce_editor
only_liz_truss_editor = liz_truss_editor - lettuce_editor
only_lettuce_editor = lettuce_editor - liz_truss_editor

# Count how many revisions each author made on each page (index: username)
liz_truss_counts = liz_truss_df_recent.groupby("username")["revision_id"].count().to_frame("counts")
lettuce_counts = lettuce_df_authors.groupby("username")["revision_id"].count().to_frame("counts")

# Create user category: which page(s) the author edited
dfs = [liz_truss_counts, lettuce_counts]

for counts_frame in dfs:
    counts_frame.loc[counts_frame.index.isin(only_liz_truss_editor), "user_category"] = "Liz Truss"
    counts_frame.loc[counts_frame.index.isin(only_lettuce_editor), "user_category"] = "Lettuce"
    counts_frame.loc[counts_frame.index.isin(common_editor), "user_category"] = "Both"

# Define if "one-time" or "recurring" author (more than one revision on that page)
liz_truss_counts["user_type"] = np.where(liz_truss_counts["counts"] > 1, "Recurring", "Once")
lettuce_counts["user_type"] = np.where(lettuce_counts["counts"] > 1, "Recurring", "Once")
lettuce_counts["page"] = "Lettuce"
liz_truss_counts["page"] = "Liz Truss"

# Stack both pages: one row per (page, author)
# NOTE(review): authors active on both pages appear once per page, so the two
# samples are not fully independent — confirm this is acceptable for the test.
df_merged = pd.concat([liz_truss_counts, lettuce_counts])

# Carry out chi2 test of independence between page and user type
observed = pd.crosstab(df_merged["page"], df_merged["user_type"])
chi2, p, dof, expected = chi2_contingency(observed)
# Take the labels from the observed table so rows/columns cannot be mislabelled,
# and round instead of truncating the expected counts.
expected_df = pd.DataFrame(expected, index=observed.index, columns=observed.columns).round(1)

# Only claim a rejection when the p-value is actually below the significance level
alpha = 0.05
if p < alpha:
    print(f"We reject the H0 (distribution of user types is the same across the meme and Liz Truss page) since the p-value is {p}.")
else:
    print(f"We cannot reject the H0 (distribution of user types is the same across the meme and Liz Truss page) since the p-value is {p}.")

print(f"\nActual distribution: \n{observed}")

print(f"\nExpected distribution: \n{expected_df}")
