In [1]:
# importing necessary libraries
import requests
from bs4 import BeautifulSoup
import wget
import calendar
import pickle
import re
import os
import sys
import time
import json
import pandas as pd
from sqlite3 import dbapi2 as sq3
from collections import Counter
from pathlib import Path
import numpy as np
import copy
from matplotlib import pyplot as plt
import country_converter as coco


from collections import OrderedDict
# notebook-wide configuration
PATHSTART = '.'
# None removes the cap on how many columns pandas renders for a dataframe
pd.set_option('display.max_columns', None)

In [None]:
# function to delete a file
def del_file(file):
    """Delete ``file`` from disk if it exists and report the outcome on stdout.

    Parameters
    ----------
    file : str or os.PathLike
        Path of the file to remove.

    Returns
    -------
    None
        The outcome is only printed: a confirmation when the file was removed,
        'No such file found' when the path does not exist.
    """
    if os.path.exists(file):
        os.remove(file)
        # f-string instead of '+' concatenation so that a pathlib.Path argument
        # does not raise TypeError after the file has already been removed
        print(f'\n{file} is deleted')
    else:
        print('No such file found')

In [2]:
# connecting to the SQLite database file 'master.db' (a relative path); the
# resulting connection is what the read_sql_query cells below query
master_db = sq3.connect('master.db')

In [13]:
# loading list of new users from database file to a dataframe
new_users_df = pd.read_sql_query('''SELECT * FROM newusers''', master_db)
# assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated and leaves the
# dataframe unchanged when pandas Copy-on-Write is active
new_users_df['country'] = new_users_df['country'].fillna('Unknown')
new_users_df

Unnamed: 0,user_name,cohort,campaign,cat,country
130478,Deshamer,2012,WLM,Images_from_Wiki_Loves_Monuments_2012_in_the_N...,Netherlands
130479,Den Burg,2012,WLM,Images_from_Wiki_Loves_Monuments_2012_in_the_N...,Netherlands
130480,Debruinjesse,2012,WLM,Images_from_Wiki_Loves_Monuments_2012_in_the_N...,Netherlands
130481,Wendy de Bert,2014,WLE,Images_from_Wiki_Loves_Earth_2014_in_the_Nethe...,Netherlands
130482,D3eam4,2012,WLM,Images_from_Wiki_Loves_Monuments_2012_in_the_N...,Netherlands
...,...,...,...,...,...
131241,Steven van der Wal,2010,WLM,Images_from_Wiki_Loves_Monuments_2010,Netherlands
131242,Taboe007,2010,WLM,Images_from_Wiki_Loves_Monuments_2010,Netherlands
131243,Timbeglinger,2010,WLM,Images_from_Wiki_Loves_Monuments_2010,Netherlands
131244,Ton Engwirda,2010,WLM,Images_from_Wiki_Loves_Monuments_2010,Netherlands


In [9]:
# loading all edits table to a dataframe and dropping duplicate values along with removing page and user edits
all_edits_df = pd.read_sql_query('''SELECT * FROM alledits''', master_db)
all_edits_df = all_edits_df.drop_duplicates(
    subset=['wiki_db', 'event_user_text', 'event_user_registration_timestamp', 'revision_id'],
    keep='first', ignore_index=True)
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments below cannot trigger SettingWithCopyWarning
all_edits_df = all_edits_df[(all_edits_df['event_entity'] != 'user')
                            & (all_edits_df['event_entity'] != 'page')].copy()

# processing the edit timestamp data and getting required columns
# (vectorised .dt.strftime replaces the row-by-row apply/lambda calls)
all_edits_df['event_timestamp'] = all_edits_df['event_timestamp'].astype('datetime64[s]')
all_edits_df['event_month_number'] = all_edits_df['event_timestamp'].dt.strftime('%Y-%m')
all_edits_df['event_month_year'] = all_edits_df['event_timestamp'].dt.strftime('%B-%Y')
# the timestamp is stored as a YYYYMMDDHHMMSS integer
all_edits_df['event_timestamp'] = (all_edits_df['event_timestamp']
                                   .dt.strftime('%Y%m%d%H%M%S')
                                   .astype('int64'))

# saving the all edits dataframe as a csv file for convenience
all_edits_df.to_csv('alledits_data.csv', index=False)
df_columns = all_edits_df.columns.to_list()
# release the large frame; it is read back from the csv file when needed
del all_edits_df

In [23]:
# reading the saved csv file, dropping the columns that are no longer needed
# and renaming event_user_text to user_name
all_edits_df = (
    pd.read_csv('alledits_data.csv')
    .drop(columns=['event_user_registration_timestamp', 'revision_id', 'event_entity'])
    .rename(columns={'event_user_text': 'user_name'})
)
all_edits_df

Unnamed: 0,wiki_db,event_entity,event_timestamp,event_user_text,event_user_registration_timestamp,revision_id,event_month_number,event_month_year
0,frwiktionary,revision,20100325165616,Helicoman,2010-03-25 16:54:20.0,7065871.0,2010-03,March-2010
1,frwiktionary,revision,20100325171511,Helicoman,2010-03-25 16:54:20.0,7065916.0,2010-03,March-2010
2,frwiktionary,revision,20100703105142,Davitof,2009-03-08 20:23:09.0,9041840.0,2010-07,July-2010
3,frwiktionary,revision,20110303223810,Leanj,2011-03-03 22:36:10.0,8689022.0,2011-03,March-2011
4,frwiktionary,revision,20110920085939,Jps726,2011-09-14 09:42:28.0,9454512.0,2011-09,September-2011
...,...,...,...,...,...,...,...,...
12636569,commonswiki,revision,20220731235352,JoachimKohler-HB,2012-09-30 12:06:12.0,679418979.0,2022-07,July-2022
12636570,commonswiki,revision,20220731235434,JoachimKohler-HB,2012-09-30 12:06:12.0,679419032.0,2022-07,July-2022
12636571,commonswiki,revision,20220731235914,Joaocoutinho1,2021-08-21 16:32:13.0,679419448.0,2022-07,July-2022
12636572,commonswiki,revision,20220731235915,Joaocoutinho1,2021-08-21 16:32:13.0,679419453.0,2022-07,July-2022


In [14]:
# loading campaigns info from a csv file for list of unique countries
campaigns_df = pd.read_csv('major_campaigns_timelines.csv')
# assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated and leaves the
# dataframe unchanged when pandas Copy-on-Write is active
campaigns_df['country'] = campaigns_df['country'].fillna('Unknown')
countries_df = pd.DataFrame(campaigns_df['country'].unique())
countries_df.rename(columns={0:'country'}, inplace=True)

# generating iso codes for list of unique countries
# not_found=None keeps the input name for anything the converter cannot match,
# which is what lets the override dictionary below be keyed by those names
cc = coco.CountryConverter()
iso3_codes = cc.convert(names=list(countries_df['country']), to='ISO3', not_found=None)
countries_df.insert(1, 'ISO3', iso3_codes)
# manual overrides for names the converter cannot resolve; sub-national regions
# are mapped to the code of the country they belong to, and placeholder
# categories are mapped to the string 'None'
country_iso_dict = {'South_Korea':'KOR', 'Caribbean':'BES',
                    'Dutch Caribbean':'BES', 'Basque Country':'ESP',
                    'Republika Srpska':'BIH', 'Cape_Verde':'CPV',
                    'Wales':'GBR', 'an_unknown_country':'None',
                    'with_unknown_country':'None', 'with_no_country':'None',
                    'Biosphere_Reserves':'None', '':'None', 'Unknown':'None'}

countries_df = countries_df.replace({'ISO3':country_iso_dict})
iso_dict = dict(zip(countries_df.country, countries_df.ISO3))
campaigns_df['iso_alpha3'] = campaigns_df['country'].map(iso_dict)

# saving the modified data into a new csv file
campaigns_df.to_csv('major_campaigns_timelines_iso.csv',index=False)
campaigns_df

with_no_country not found in regex
Cape_Verde not found in regex
with_unknown_country not found in regex
Biosphere_Reserves not found in regex
South_Korea not found in regex
Caribbean not found in regex
Wales not found in regex
Basque Country not found in regex
Dutch Caribbean not found in regex
Republika Srpska not found in regex


Unnamed: 0,year,country,start_date,end_date,cat,campaign,iso_alpha3
0,2014,with_no_country,20141001000000,20141130235959,Images_from_Wiki_Loves_Africa_2014_in_an_unkno...,WLA,
1,2014,Algeria,20141001000000,20141130235959,Images_from_Wiki_Loves_Africa_2014_in_Algeria,WLA,DZA
2,2014,Angola,20141001000000,20141130235959,Images_from_Wiki_Loves_Africa_2014_in_Angola,WLA,AGO
3,2014,Benin,20141001000000,20141130235959,Images_from_Wiki_Loves_Africa_2014_in_Benin,WLA,BEN
4,2014,Botswana,20141001000000,20141130235959,Images_from_Wiki_Loves_Africa_2014_in_Botswana,WLA,BWA
...,...,...,...,...,...,...,...
1241,2017,with_no_country,20170901000000,20170930235959,Images_from_Wiki_Loves_Monuments_2017,WLM,
1242,2018,with_no_country,20180901000000,20180930235959,Images_from_Wiki_Loves_Monuments_2018,WLM,
1243,2019,with_no_country,20190901000000,20190930235959,Images_from_Wiki_Loves_Monuments_2019,WLM,
1244,2020,with_no_country,20200901000000,20201107235959,Images_from_Wiki_Loves_Monuments_2020,WLM,


In [16]:
# reloading the campaign table that carries the iso_alpha3 column
campaigns_df = pd.read_csv('major_campaigns_timelines_iso.csv')

# lookups from campaign category to its end and start dates
enddate_map_dict = dict(zip(campaigns_df['cat'], campaigns_df['end_date']))
startdate_map_dict = dict(zip(campaigns_df['cat'], campaigns_df['start_date']))

# lookup from campaign country to its iso alpha-3 code
iso_map_dict = dict(zip(campaigns_df['country'], campaigns_df['iso_alpha3']))

# attaching campaign dates (by category) and iso code (by country) to each new user
new_users_df = new_users_df.assign(
    end_date=new_users_df['cat'].map(enddate_map_dict),
    start_date=new_users_df['cat'].map(startdate_map_dict),
    iso_code=new_users_df['country'].map(iso_map_dict),
)
new_users_df

Unnamed: 0,user_name,cohort,campaign,cat,country,end_date,start_date,iso_code
130478,Deshamer,2012,WLM,Images_from_Wiki_Loves_Monuments_2012_in_the_N...,Netherlands,20120930235959,20120901000000,NLD
130479,Den Burg,2012,WLM,Images_from_Wiki_Loves_Monuments_2012_in_the_N...,Netherlands,20120930235959,20120901000000,NLD
130480,Debruinjesse,2012,WLM,Images_from_Wiki_Loves_Monuments_2012_in_the_N...,Netherlands,20120930235959,20120901000000,NLD
130481,Wendy de Bert,2014,WLE,Images_from_Wiki_Loves_Earth_2014_in_the_Nethe...,Netherlands,20140630235959,20140501000000,NLD
130482,D3eam4,2012,WLM,Images_from_Wiki_Loves_Monuments_2012_in_the_N...,Netherlands,20120930235959,20120901000000,NLD
...,...,...,...,...,...,...,...,...
131241,Steven van der Wal,2010,WLM,Images_from_Wiki_Loves_Monuments_2010,Netherlands,20100930235959,20100901000000,NLD
131242,Taboe007,2010,WLM,Images_from_Wiki_Loves_Monuments_2010,Netherlands,20100930235959,20100901000000,NLD
131243,Timbeglinger,2010,WLM,Images_from_Wiki_Loves_Monuments_2010,Netherlands,20100930235959,20100901000000,NLD
131244,Ton Engwirda,2010,WLM,Images_from_Wiki_Loves_Monuments_2010,Netherlands,20100930235959,20100901000000,NLD


In [28]:
# joining each new user with their edits on user_name
processed_df = new_users_df.merge(all_edits_df, how='inner', on='user_name')

# flagging edits made on or before the end of the user's campaign; the mask is
# negated (rather than testing '>') so rows without an end_date are still kept
edited_by_campaign_end = processed_df['event_timestamp'] <= processed_df['end_date']

# keeping only the later edits and dropping the columns that are no longer needed
processed_df = (
    processed_df
    .loc[~edited_by_campaign_end]
    .drop(columns=['cat', 'start_date', 'end_date'])
)
processed_df

Unnamed: 0,user_name,cohort,campaign,country,iso_code,wiki_db,event_timestamp,event_month_number,event_month_year
80,Migjen Fazliu,2016,WLE,Albania,ALB,commonswiki,20160926193236,2016-09,September-2016
118,Musli Berisha,2016,WLE,Albania,ALB,commonswiki,20161004084522,2016-10,October-2016
119,Musli Berisha,2016,WLE,Albania,ALB,commonswiki,20161004093433,2016-10,October-2016
120,Musli Berisha,2016,WLE,Albania,ALB,commonswiki,20161004094136,2016-10,October-2016
121,Musli Berisha,2016,WLE,Albania,ALB,commonswiki,20161004094136,2016-10,October-2016
...,...,...,...,...,...,...,...,...,...
12636561,NiaPol,2018,WLE,with_no_country,,commonswiki,20210529171514,2021-05,May-2021
12636562,NiaPol,2018,WLE,with_no_country,,commonswiki,20210529171743,2021-05,May-2021
12636563,NiaPol,2018,WLE,with_no_country,,commonswiki,20210529171903,2021-05,May-2021
12636564,NiaPol,2018,WLE,with_no_country,,commonswiki,20220722090007,2022-07,July-2022


In [29]:
# counting edits per user / month / country / campaign / cohort / wiki and
# saving the result for the app into a csv file
# NOTE(review): groupby's default dropna=True leaves out rows whose key columns
# are NaN (e.g. a missing iso_code) - confirm that this is intended
group_keys = ['user_name', 'event_month_number', 'event_month_year', 'country',
              'campaign', 'cohort', 'wiki_db', 'iso_code']
sort_keys = ['event_month_number', 'event_month_year', 'country',
             'campaign', 'cohort', 'wiki_db']
final_df = (
    processed_df
    .groupby(group_keys)
    .size()
    .reset_index(name='edit_count')
    .sort_values(by=sort_keys)
)
final_df.to_csv('monthwise_filtered_data.csv', index=False)
final_df

Unnamed: 0,user_name,event_month_number,event_month_year,country,campaign,cohort,wiki_db,iso_code,edit_count
1728,AFiermas,2010-10,October-2010,Netherlands,WLM,2010,commonswiki,NLD,1
17137,CWKramer,2010-10,October-2010,Netherlands,WLM,2010,commonswiki,NLD,144
20516,D1N0F070,2010-10,October-2010,Netherlands,WLM,2010,commonswiki,NLD,2
25520,Edithnib,2010-10,October-2010,Netherlands,WLM,2010,commonswiki,NLD,15
27646,Erik009,2010-10,October-2010,Netherlands,WLM,2010,commonswiki,NLD,4
...,...,...,...,...,...,...,...,...,...
109768,Саня Новиков,2022-09,September-2022,Russia,WLM,2015,ruwikivoyage,RUS,1
12518,Basingo,2022-09,September-2022,Sweden,WLE,2020,svwikisource,SWE,31
69244,Nalle&Lisa,2022-09,September-2022,Sweden,WLM,2012,svwiki,SWE,13
108092,Василюк Олексій,2022-09,September-2022,Ukraine,WLE,2013,ukwiki,UKR,1


In [None]:
# removing the intermediate csv files written earlier in the notebook
for intermediate_csv in ('alledits_data.csv', 'major_campaigns_timelines_iso.csv'):
    del_file(intermediate_csv)