In [10]:
import sys

!{sys.executable} -m pip install mwclient

!{sys.executable} -m pip install transformers

!{sys.executable} -m pip install torch torchvision torchaudio

!{sys.executable} -m pip install tensorflow

!{sys.executable} -m pip install tf-keras

!{sys.executable} -m pip install pandas

In [74]:
import mwclient
import time

site = mwclient.Site('en.wikipedia.org')
page = site.pages['Ethereum']


In [75]:
revs = list(page.revisions())
revs[0]

OrderedDict([('revid', 1219872278),
             ('parentid', 1218713785),
             ('minor', ''),
             ('user', '0dd b1t'),
             ('timestamp',
              time.struct_time(tm_year=2024, tm_mon=4, tm_mday=20, tm_hour=10, tm_min=41, tm_sec=36, tm_wday=5, tm_yday=111, tm_isdst=-1)),
             ('comment',
              '/* History */ seems like Buterin means only blockchain, not Bitcoin even particularly cause BTC cannot apply Layer2')])

In [76]:
revs = sorted(revs, key=lambda rev: rev["timestamp"]) ##Sorting by timpestamp

In [77]:
revs[0]

OrderedDict([('revid', 592567939),
             ('parentid', 0),
             ('user', 'Sanpitch'),
             ('timestamp',
              time.struct_time(tm_year=2014, tm_mon=1, tm_mday=27, tm_hour=1, tm_min=53, tm_sec=45, tm_wday=0, tm_yday=27, tm_isdst=-1)),
             ('comment',
              "[[WP:AES|←]]Created page with '{{Infobox currency | image_1 =  | image_title_1 =  | image_width_1 =  | image_2 =  | image_title_2 =  | image_width_2 =  |issuing_authority = None. The Ethereum...'")])

In [78]:
import torch

In [79]:
from transformers import pipeline
sentiment_pipeline = pipeline("text-classification")

def find_sentiment(text):
    sent = sentiment_pipeline([text[:250]])[0]
    score = sent["score"]
    if sent["label"] == "NEGATIVE":
        score *= -1
    return score

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [80]:
find_sentiment("")

0.7481209635734558

In [81]:
edits = {}

for rev in revs:        
    date = time.strftime("%Y-%m-%d", rev["timestamp"])
    if date not in edits:
        edits[date] = dict(sentiments=list(), edit_count=0)
    
    edits[date]["edit_count"] += 1
    
    comment = rev.get("comment", "")
    edits[date]["sentiments"].append(find_sentiment(comment))

In [82]:
from statistics import mean

for key in edits:
    if len(edits[key]["sentiments"]) > 0:
        edits[key]["sentiment"] = mean(edits[key]["sentiments"])
        edits[key]["neg_sentiment"] = len([s for s in edits[key]["sentiments"] if s < 0]) / len(edits[key]["sentiments"])
    else:
        edits[key]["sentiment"] = 0
        edits[key]["neg_sentiment"] = 0
    
    del edits[key]["sentiments"]

edits

In [83]:
import pandas as pd

edits_df = pd.DataFrame.from_dict(edits, orient="index")

In [84]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2014-01-27,1,-0.998511,1.000000
2014-02-01,1,-0.997276,1.000000
2014-04-06,5,0.790979,0.000000
2014-04-09,24,0.646407,0.083333
2014-04-10,9,-0.361518,0.666667
...,...,...,...
2024-03-19,1,-0.999787,1.000000
2024-04-01,2,-0.999709,1.000000
2024-04-03,2,-0.010781,0.500000
2024-04-13,1,-0.999720,1.000000


In [85]:
edits_df.index = pd.to_datetime(edits_df.index)


In [93]:
from datetime import datetime

dates = pd.date_range(start="2014-04-06",end=datetime.today())

In [94]:
edits_df = edits_df.reindex(dates, fill_value=0)

In [95]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2014-04-06,5,0.790979,0.000000
2014-04-07,0,0.000000,0.000000
2014-04-08,0,0.000000,0.000000
2014-04-09,24,0.646407,0.083333
2014-04-10,9,-0.361518,0.666667
...,...,...,...
2024-05-10,0,0.000000,0.000000
2024-05-11,0,0.000000,0.000000
2024-05-12,0,0.000000,0.000000
2024-05-13,0,0.000000,0.000000


In [96]:
rolling_edits = edits_df.rolling(30, min_periods=30).mean()

In [97]:
rolling_edits = rolling_edits.dropna()

In [98]:
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2014-05-05,2.133333,0.119197,0.092778
2014-05-06,1.966667,0.092831,0.092778
2014-05-07,1.966667,0.092831,0.092778
2014-05-08,1.966667,0.092831,0.092778
2014-05-09,1.166667,0.071284,0.090000
...,...,...,...
2024-05-10,0.066667,-0.066594,0.066667
2024-05-11,0.066667,-0.066594,0.066667
2024-05-12,0.066667,-0.066594,0.066667
2024-05-13,0.033333,-0.033270,0.033333


In [99]:
rolling_edits.to_csv("wikipedia_edits.csv")