In [None]:
# importing necessary libraries
import requests
from bs4 import BeautifulSoup
import wget
import calendar
import pickle
import re
import os
import sys
import time
import json
import pandas as pd
from sqlite3 import dbapi2 as sq3
from collections import Counter
from pathlib import Path
import numpy as np
import copy
from matplotlib import pyplot as plt
import country_converter as coco


from collections import OrderedDict
# notebook-wide settings
# PATHSTART is presumably the base directory for data files; it is not
# referenced by the cells shown in this notebook
PATHSTART = '.'
# None removes the column limit so wide frames are displayed in full
pd.set_option('display.max_columns', None)

In [None]:
def del_file(file):
    """Delete ``file`` from disk if it exists and report the outcome on stdout.

    Parameters
    ----------
    file : str or os.PathLike
        Path of the file to remove.

    Returns
    -------
    None
        A blank line followed by "<file> is deleted" is printed after a
        successful removal; "No such file found" is printed when nothing
        exists at that path.
    """
    if os.path.exists(file):
        os.remove(file)
        # an f-string formats str and path-like arguments alike; plain string
        # concatenation raised TypeError for pathlib.Path after the removal
        print(f'\n{file} is deleted')
    else:
        print('No such file found')

In [None]:
# connecting to database file
# sqlite3 silently creates an empty database when the path does not exist, which
# would only surface later as a "no such table" error; fail early instead
if not os.path.isfile('master.db'):
    raise FileNotFoundError("master.db not found in " + os.getcwd())
master_db = sq3.connect('master.db')

In [None]:
# loading list of new users from database file to a dataframe
new_users_df = pd.read_sql_query('''SELECT * FROM newusers''', master_db)
# assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form warns on pandas 2.x and no longer
# updates the parent frame once Copy-on-Write is active
new_users_df['country'] = new_users_df['country'].fillna('Unknown')
new_users_df

In [None]:
# loading all edits table to a dataframe and dropping duplicate values along with removing page and user edits
all_edits_df = pd.read_sql_query('''SELECT * FROM alledits''', master_db)
all_edits_df = all_edits_df.drop_duplicates(
    subset=['wiki_db', 'event_user_text', 'event_user_registration_timestamp', 'revision_id'],
    keep='first', ignore_index=True)
# .copy() turns the filtered rows into an independent frame, so the column
# assignments below cannot trigger SettingWithCopyWarning
all_edits_df = all_edits_df[(all_edits_df['event_entity']!='user') & (all_edits_df['event_entity']!='page')].copy()

# processing the edit timestamp data and getting required columns
# (vectorised .dt.strftime replaces the per-row apply/lambda calls)
all_edits_df['event_timestamp'] = all_edits_df['event_timestamp'].astype('datetime64[s]')
all_edits_df['event_month_number'] = all_edits_df['event_timestamp'].dt.strftime('%Y-%m')
all_edits_df['event_month_year'] = all_edits_df['event_timestamp'].dt.strftime('%B-%Y')
# the timestamp itself is stored as a YYYYMMDDHHMMSS integer
all_edits_df['event_timestamp'] = all_edits_df['event_timestamp'].dt.strftime('%Y%m%d%H%M%S').astype('int64')

# saving the all edits dataframe as a csv file for convenience
all_edits_df.to_csv('alledits_data.csv', index=False)
df_columns = all_edits_df.columns.to_list()
# the frame is released here; the next cell reloads it from the csv file
del all_edits_df

In [None]:
# reload the saved edits csv, discard the columns that are no longer needed and
# expose the editor name under the 'user_name' column
all_edits_df = (
    pd.read_csv('alledits_data.csv')
    .drop(columns=['event_user_registration_timestamp', 'revision_id', 'event_entity'])
    .rename(columns={'event_user_text': 'user_name'})
)
all_edits_df

In [None]:
# loading campaigns info from a csv file for list of unique countries
campaigns_df = pd.read_csv('major_campaigns_timelines.csv')
# assign the filled column back: fillna(inplace=True) on a column selection warns
# on pandas 2.x and does not update the frame once Copy-on-Write is active
campaigns_df['country'] = campaigns_df['country'].fillna('Unknown')
countries_df = pd.DataFrame({'country': campaigns_df['country'].unique()})

# generating iso codes for list of unique countries
cc = coco.CountryConverter()
iso3_codes = coco.convert(names=list(countries_df['country']), to='ISO3', not_found=None)
countries_df.insert(1, 'ISO3', iso3_codes)
# manual overrides applied to the ISO3 column after conversion; entries that
# are not a real country get the placeholder text 'None'
country_iso_dict = {'South_Korea':'KOR', 'Caribbean':'BES', 
                    'Dutch Caribbean':'BES', 'Basque Country':'ESP',
                    'Republika Srpska':'BIH', 'Cape_Verde':'CPV', 'an_unknown_country':'None',
                    'with_unknown_country':'None', 'with_no_country':'None',
                    'Biosphere_Reserves':'None', '':'None', 'Unknown':'None'}

countries_df = countries_df.replace({'ISO3':country_iso_dict})
iso_dict = dict(zip(countries_df.country, countries_df.ISO3))
campaigns_df['iso_alpha3'] = campaigns_df['country'].map(iso_dict)

# saving the modified data into a new csv file
# NOTE: pandas >= 2.0 parses the literal text 'None' as a missing value when a
# csv file is read with default settings
campaigns_df.to_csv('major_campaigns_timelines_iso.csv',index=False)
campaigns_df

In [None]:
# mapping category with campaign start and end dates 
campaigns_df = pd.read_csv('major_campaigns_timelines_iso.csv')
# pandas >= 2.0 parses the literal text 'None' as missing, so the placeholder
# code stored for unknown countries comes back as NaN; restore it so that those
# users keep a non-null iso code (groupby drops NaN keys by default)
campaigns_df['iso_alpha3'] = campaigns_df['iso_alpha3'].fillna('None')
enddate_map_dict = dict(zip(campaigns_df.cat, campaigns_df.end_date))
startdate_map_dict = dict(zip(campaigns_df.cat, campaigns_df.start_date))

# mapping countries of new users with campaign country iso codes  
iso_map_dict = dict(zip(campaigns_df.country, campaigns_df.iso_alpha3))
# users whose 'cat' or 'country' has no entry in the lookups get NaN here
new_users_df['end_date'] = new_users_df['cat'].map(enddate_map_dict)
new_users_df['start_date'] = new_users_df['cat'].map(startdate_map_dict)
new_users_df['iso_code'] = new_users_df['country'].map(iso_map_dict)
new_users_df

In [None]:
# join every new user with their edits, then discard the edits whose timestamp
# is not later than the end date of the user's campaign
processed_df = new_users_df.merge(all_edits_df, how='inner', on='user_name')
processed_df = (
    processed_df
    # rows with a missing end_date compare False and are therefore kept
    .loc[~(processed_df['event_timestamp'] <= processed_df['end_date'])]
    # the campaign bookkeeping columns are not needed past this point
    .drop(columns=['cat', 'start_date', 'end_date'])
)
processed_df

In [None]:
# count edits per user / month / country / campaign / cohort / wiki, order the
# result and write it to the csv file used by the app
final_df = (
    processed_df
    .groupby(['user_name', 'event_month_number', 'event_month_year', 'country',
              'campaign', 'cohort', 'wiki_db', 'iso_code'])
    .size()
    .reset_index(name='edit_count')
    .sort_values(by=['event_month_number', 'event_month_year', 'country',
                     'campaign', 'cohort', 'wiki_db'])
)
final_df.to_csv('monthwise_filtered_data.csv', index=False)
final_df

In [None]:
# remove the intermediate csv files written by the earlier cells
for leftover_csv in ('alledits_data.csv', 'major_campaigns_timelines_iso.csv'):
    del_file(leftover_csv)