In [1]:
# Requirements
import pandas as pd
import numpy as np
import datetime as dt
import sklearn

import re
import nltk
from nltk.stem import SnowballStemmer   
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities

import warnings
warnings.filterwarnings('ignore')

In [2]:
wiki = pd.read_csv('wikiRfA.csv')
wiki.head()

Unnamed: 0,SOURCE,TARGET,VOTE,RESULT,YEAR,DATE,TEXT
0,Steel1943,BDD,1,1,2013,19/04/2013 23:13,'''Support''' as co-nom.
1,Cuchullain,BDD,1,1,2013,20/04/2013 01:04,'''Support''' as nominator.--
2,INeverCry,BDD,1,1,2013,19/04/2013 23:43,'''Support''' per noms.
3,Cncmaster,BDD,1,1,2013,20/04/2013 00:11,'''Support''' per noms. BDD is a strong contri...
4,Miniapolis,BDD,1,1,2013,20/04/2013 00:56,"'''Support''', with great pleasure. I work wit..."


In [3]:
# Check data types
wiki.dtypes

SOURCE    object
TARGET    object
VOTE       int64
RESULT     int64
YEAR       int64
DATE      object
TEXT      object
dtype: object

In [4]:
# Adjust Date Format
wiki['DATE'] = pd.to_datetime(wiki['DATE'], infer_datetime_format=True)

In [5]:
# Get Month Variable
wiki['MONTH'] = pd.to_datetime(wiki['DATE']).dt.month

Since elections generally last for 7 days and will wait a few months to run again for the position of administrator after losing an election. Hence, the combination of 'TARGET', 'YEAR' and 'MONTH' should be sufficient to identify each election.

In [6]:
# Report Features with missing values - training data
for i in wiki.columns:
    if wiki[i].isnull().sum() != 0:
        print(i, ' : ', wiki[i].isnull().sum())
    else:
        print(i, ' : No missing data')

SOURCE  :  1646
TARGET  : No missing data
VOTE  : No missing data
RESULT  : No missing data
YEAR  : No missing data
DATE  :  9243
TEXT  :  7028
MONTH  :  9243


In [7]:
print(wiki[wiki['SOURCE'].isnull()])

       SOURCE             TARGET  VOTE  RESULT  YEAR DATE TEXT  MONTH
707       NaN        Jason Quinn     0       1  2013  NaT  NaN    NaN
708       NaN        Jason Quinn     0       1  2013  NaT  NaN    NaN
793       NaN            Legoktm     1       1  2013  NaT  NaN    NaN
1126      NaN      Mattythewhite    -1       1  2013  NaT  NaN    NaN
1128      NaN      Mattythewhite    -1       1  2013  NaT  NaN    NaN
...       ...                ...   ...     ...   ...  ...  ...    ...
197377    NaN             Yelyos     1      -1  2004  NaT  NaN    NaN
197441    NaN  Christopher Mahan     1       1  2003  NaT  NaN    NaN
197496    NaN       Jwrosenzweig     1       1  2003  NaT  NaN    NaN
197542    NaN            Patrick     1       1  2003  NaT  NaN    NaN
197572    NaN         Stevertigo     1       1  2003  NaT  NaN    NaN

[1646 rows x 8 columns]


In [8]:
print(wiki[wiki['DATE'].isnull()])

                SOURCE         TARGET  VOTE  RESULT  YEAR DATE  \
707                NaN    Jason Quinn     0       1  2013  NaT   
708                NaN    Jason Quinn     0       1  2013  NaT   
793                NaN        Legoktm     1       1  2013  NaT   
969        Majoreditor      Lord Roem     1       1  2013  NaT   
1126               NaN  Mattythewhite    -1       1  2013  NaT   
...                ...            ...   ...     ...   ...  ...   
197590  Daniel Quinlan         Ugen64     1       1  2003  NaT   
197591        Delirium         Ugen64     1       1  2003  NaT   
197593          Menchi   Vancouverguy     1       1  2003  NaT   
197597           Jiang    WhisperToMe     1       1  2003  NaT   
197599        Jimregan        Zanimum     1       1  2003  NaT   

                                                     TEXT  MONTH  
707                                                   NaN    NaN  
708                                                   NaN    NaN  
793   

The inspection of the data suggests that these observations are missing at random, created by used who were not logged into their account when "voting". If the conclusion that these observations are missing at random is correct then these rows cannot be reliably imputed, and dropping these observations will not significantly impact the nature of the data.

In [9]:
# Remove missing values
wiki = wiki.dropna()
wiki.shape

(184104, 8)

In [10]:
# Number of voters
wiki['SOURCE'].nunique()

10053

In [11]:
# Number of nominee
wiki['TARGET'].nunique()

3445

In [12]:
# Vote Counts
wiki['VOTE'].value_counts()

 1    133609
-1     39002
 0     11493
Name: VOTE, dtype: int64

In [13]:
# Encode votes
wiki_enc = pd.get_dummies(wiki, columns = ['VOTE'], drop_first=False)
wiki_enc.head()

Unnamed: 0,SOURCE,TARGET,RESULT,YEAR,DATE,TEXT,MONTH,VOTE_-1,VOTE_0,VOTE_1
0,Steel1943,BDD,1,2013,2013-04-19 23:13:00,'''Support''' as co-nom.,4.0,0,0,1
1,Cuchullain,BDD,1,2013,2013-04-20 01:04:00,'''Support''' as nominator.--,4.0,0,0,1
2,INeverCry,BDD,1,2013,2013-04-19 23:43:00,'''Support''' per noms.,4.0,0,0,1
3,Cncmaster,BDD,1,2013,2013-04-20 00:11:00,'''Support''' per noms. BDD is a strong contri...,4.0,0,0,1
4,Miniapolis,BDD,1,2013,2013-04-20 00:56:00,"'''Support''', with great pleasure. I work wit...",4.0,0,0,1


In [14]:
# Result Counts
wiki['RESULT'].value_counts()

 1    113316
-1     70788
Name: RESULT, dtype: int64

In [15]:
# Check for self-loops
self_vote = wiki[wiki['SOURCE'] == wiki['TARGET']]
self_vote.shape

(87, 8)

In [16]:
print('The included time range is: ' + str(wiki['DATE'].dt.date.min()) + ' to ' + str(wiki['DATE'].dt.date.max()))

The included time range is: 2003-08-16 to 2013-06-05


## Text Analysis

In [17]:
# Instanciate Stemmer
porter = SnowballStemmer("english")

# Instanciate Lemmetizer
lmtzr = WordNetLemmatizer()

# Set Stropwords - Combination of generic english stopwords and list of words noticed to be too generic when viewing bigram and
# trigram outputs during vectorization
stop_words = set(stopwords.words('english') + ['support', 'oppose', 'user', 'admin', 'candidate', 'wikipedia', 'color',
                                              'http', 'php', 'org', 'luck', 'font', 'see', 'green', 'would', 'e'])

# Function to get text in desired lowercase form
def abbr_or_lower(word):
    if re.match('([A-Z]+[a-z]*){2,}', word):
        return word
    else:
        return word.lower()

# Function for different forms of tokeniztion (all 3 forms tested for performace)
def tokenize(words, modulation):
    tokens = re.split(r'\W+', words)
    stems = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        lowers=abbr_or_lower(token)
        if lowers not in stop_words:
            if re.search('[a-zA-Z]', lowers):
                if modulation==1:
                    stems.append(porter.stem(lowers))
                if modulation==2:
                    stems.append(lmtzr.lemmatize(lowers))
                if modulation==0:
                    stems.append(lowers)
    return stems

In [18]:
# Isolate Text
text = wiki.TEXT

# Apply Pre-Processing
comments = [tokenize(comment, 2) for comment in text]

In [19]:
# Instanciate TfidfVectorizer (tokenizer set to iterate over dcomments)

tfidf1 = TfidfVectorizer(ngram_range=(1, 1), tokenizer=lambda doc: doc, lowercase=False) # Unigrams
tfidf2 = TfidfVectorizer(ngram_range=(2, 2), tokenizer=lambda doc: doc, lowercase=False) # Bigrams
tfidf3 = TfidfVectorizer(ngram_range=(3, 3), tokenizer=lambda doc: doc, lowercase=False) # Trigrams

In [20]:
# Unigrams

com_uni = tfidf1.fit_transform(comments)

uni_df = pd.DataFrame.sparse.from_spmatrix(com_uni, columns=tfidf1.get_feature_names())
uni_df = uni_df.sum(axis=0)
uni_df = uni_df.sort_values(axis=0, ascending=False)
print(uni_df.head(10))

good          7300.218712
per           5236.580255
editor        3838.287229
strong        3029.827766
tool          3008.025322
great         2779.936631
experience    2573.997252
reason        2503.974985
answer        2426.053530
look          2397.888122
dtype: float64


In [21]:
# Bigrams

com_bi = tfidf2.fit_transform(comments)

bi_df = pd.DataFrame.sparse.from_spmatrix(com_bi, columns=tfidf2.get_feature_names())
bi_df = bi_df.sum(axis=0)
bi_df = bi_df.sort_values(axis=0, ascending=False)
print(bi_df.head(10))

span style          1187.838081
look good           1115.587776
per nom             1026.454993
good editor          985.524234
answer question      852.592994
abuse tool           732.656185
style family         611.193345
make good            532.632393
style background     527.637006
good work            466.063411
dtype: float64


In [22]:
# Trigrams

com_tri = tfidf3.fit_transform(comments)

tri_df = pd.DataFrame.sparse.from_spmatrix(com_tri, columns=tfidf3.get_feature_names())
tri_df = tri_df.sum(axis=0)
tri_df = tri_df.sort_values(axis=0, ascending=False)
print(tri_df.head(10))

span style family        513.337341
en w index               423.533323
w index title            418.049257
style background gold    376.643787
span style border        330.012564
border 1px solid         299.672212
style border 1px         284.016844
small span style         235.807061
unlikely abuse tool      226.084665
background gold WP       210.933182
dtype: float64


## Analyse Voting by Target

In [23]:
nominees = wiki_enc.groupby(['TARGET']).agg(SOURCE=('SOURCE', 'count'), VOTE_AGAINST=('VOTE_-1', 'sum'), 
                                              VOTE_NEUTRAL=('VOTE_0', 'sum'), VOTE_FOR=('VOTE_1', 'sum'),
                                              RESULT=('RESULT', pd.Series.max))
nominees.head()

Unnamed: 0_level_0,SOURCE,VOTE_AGAINST,VOTE_NEUTRAL,VOTE_FOR,RESULT
TARGET,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
(aeropagitica),51,10.0,5,36.0,1
07bargem,6,6.0,0,0.0,-1
1234r00t,6,6.0,0,0.0,-1
168...,6,0.0,0,6.0,1
21655,24,18.0,2,4.0,-1


In [24]:
print('Number of Elections :', nominees.shape[0])
print('Number of Elections Won and Lost :')
nominees['RESULT'].value_counts()

Number of Elections : 3445
Number of Elections Won and Lost :


 1    1860
-1    1585
Name: RESULT, dtype: int64

In [25]:
nominees_list = set(wiki['TARGET'].tolist())
voters_list = wiki['SOURCE'].tolist()

In [26]:
# Number of times a nominee voted for another
target_votes = []
for i in nominees_list:
    target_votes.append(voters_list.count(i))

In [27]:
nominees['TARGET_VOTES'] = target_votes

In [28]:
# Overall Descriptive Statistics
nominees.describe()

Unnamed: 0,SOURCE,VOTE_AGAINST,VOTE_NEUTRAL,VOTE_FOR,RESULT,TARGET_VOTES
count,3445.0,3445.0,3445.0,3445.0,3445.0,3445.0
mean,53.440929,11.321335,3.336139,38.783454,0.079826,33.701887
std,61.735083,18.51787,5.328107,47.367703,0.996954,73.127295
min,1.0,0.0,0.0,0.0,-1.0,0.0
25%,11.0,1.0,0.0,2.0,-1.0,0.0
50%,39.0,5.0,1.0,23.0,1.0,6.0
75%,74.0,14.0,4.0,61.0,1.0,36.0
max,943.0,329.0,58.0,562.0,1.0,1227.0


In [29]:
# Descriptive Statistics for those who lost elections
nominees_lost = nominees[nominees['RESULT'] == -1]
nominees_lost[['SOURCE', 'VOTE_AGAINST', 'VOTE_NEUTRAL', 'VOTE_FOR']].describe()

Unnamed: 0,SOURCE,VOTE_AGAINST,VOTE_NEUTRAL,VOTE_FOR
count,1585.0,1585.0,1585.0,1585.0
mean,33.948265,16.246688,4.056782,13.644795
std,49.161477,18.152423,5.676289,28.665136
min,1.0,0.0,0.0,0.0
25%,7.0,5.0,0.0,0.0
50%,15.0,11.0,2.0,2.0
75%,43.0,21.0,6.0,14.0
max,761.0,255.0,58.0,448.0


In [30]:
# Descriptive Statistics for those who won elections
nominees_won = nominees[nominees['RESULT'] == 1]
nominees_won[['SOURCE', 'VOTE_AGAINST', 'VOTE_NEUTRAL', 'VOTE_FOR']].describe()

Unnamed: 0,SOURCE,VOTE_AGAINST,VOTE_NEUTRAL,VOTE_FOR
count,1860.0,1860.0,1860.0,1860.0
mean,70.051613,7.124194,2.722043,60.205376
std,66.34266,17.782172,4.931445,49.584327
min,1.0,0.0,0.0,0.0
25%,30.0,0.0,0.0,26.0
50%,58.0,1.0,1.0,53.0
75%,91.0,6.0,3.0,81.25
max,943.0,329.0,53.0,562.0


## Graph Analysis

In [31]:
G_wiki = nx.from_pandas_edgelist(wiki, 'SOURCE', 'TARGET', edge_attr=['VOTE', 'RESULT', 'DATE'], 
                                 create_using=nx.MultiDiGraph())
print(nx.info(G_wiki))

MultiDiGraph with 11056 nodes and 184104 edges


In [32]:
# Some measures not available for MultiDiGraph type
G_wiki_di = nx.from_pandas_edgelist(wiki, 'SOURCE', 'TARGET', edge_attr=['VOTE', 'RESULT', 'DATE'], 
                                    create_using=nx.DiGraph())
print(nx.info(G_wiki_di))

DiGraph with 11056 nodes and 176331 edges


*Node Properties*

Node properties chosen based on referenced measures used to identify key players.

In [33]:
wikipedian = [node for (node, val) in G_wiki.degree()]
degree = [val for (node, val) in G_wiki.degree()]
in_degree = [val for (node, val) in G_wiki.in_degree()]
out_degree = [val for (node, val) in G_wiki.out_degree()]

eigen_centrality = nx.eigenvector_centrality(G_wiki_di)
eigen_centrality = [eigen_centrality[node] for node in eigen_centrality]

#katz_centrality = nx.katz_centrality(G_wiki_di, max_iter=10000) # Max Iterations reached without convergence
#katz_centrality = [katz_centrality[node] for node in katz_centrality]

In [34]:
# Add to DataFrame
node_properties = pd.DataFrame()
node_properties['WIKIPEDIAN'] = wikipedian
node_properties['DEGREE'] = degree
node_properties['IN_DEGREE'] = in_degree
node_properties['OUT_DEGREE'] = out_degree
node_properties['EIGEN_CENTRALITY'] = eigen_centrality

In [35]:
node_properties = node_properties.assign(**dict.fromkeys(['NOMINEE'], 0))

for i in nominees_list:
    node_properties.loc[node_properties.WIKIPEDIAN == i, 'NOMINEE'] = 1
    
node_properties.head()

Unnamed: 0,WIKIPEDIAN,DEGREE,IN_DEGREE,OUT_DEGREE,EIGEN_CENTRALITY,NOMINEE
0,Steel1943,4,0,4,1.0586960000000001e-39,0
1,BDD,141,136,5,0.0249929,1
2,Cuchullain,62,60,2,0.01562839,1
3,INeverCry,132,109,23,0.02427809,1
4,Cncmaster,21,0,21,1.0586960000000001e-39,0


In [36]:
# Overall description
node_properties[['DEGREE', 'IN_DEGREE', 'OUT_DEGREE', 'EIGEN_CENTRALITY']].describe()

Unnamed: 0,DEGREE,IN_DEGREE,OUT_DEGREE,EIGEN_CENTRALITY
count,11056.0,11056.0,11056.0,11056.0
mean,33.303907,16.651954,16.651954,0.003520184
std,74.224253,42.426237,46.772764,0.008835383
min,1.0,0.0,0.0,1.0586960000000001e-39
25%,1.0,0.0,1.0,1.0586960000000001e-39
50%,4.0,0.0,2.0,1.0586960000000001e-39
75%,29.0,9.0,11.0,0.001182781
max,1645.0,943.0,1227.0,0.1283127


In [37]:
# Description for nominees

node_properties_nominee = node_properties[node_properties['NOMINEE'] == 1]
node_properties_nominee[['DEGREE', 'IN_DEGREE', 'OUT_DEGREE', 'EIGEN_CENTRALITY']].describe()

Unnamed: 0,DEGREE,IN_DEGREE,OUT_DEGREE,EIGEN_CENTRALITY
count,3445.0,3445.0,3445.0,3445.0
mean,87.142816,53.440929,33.701887,0.01129729
std,110.544445,61.735083,73.127295,0.01275517
min,1.0,1.0,0.0,2.3291319999999998e-38
25%,15.0,11.0,0.0,0.001839929
50%,56.0,39.0,6.0,0.007267859
75%,116.0,74.0,36.0,0.01705556
max,1645.0,943.0,1227.0,0.1283127


In [38]:
# Description for those voting only

node_properties_voters = node_properties[node_properties['NOMINEE'] == 0]
node_properties_voters[['DEGREE', 'IN_DEGREE', 'OUT_DEGREE', 'EIGEN_CENTRALITY']].describe()

Unnamed: 0,DEGREE,IN_DEGREE,OUT_DEGREE,EIGEN_CENTRALITY
count,7611.0,7611.0,7611.0,7611.0
mean,8.934568,0.0,8.934568,1.0586960000000001e-39
std,23.806688,0.0,23.806688,1.1028490000000002e-52
min,1.0,0.0,1.0,1.0586960000000001e-39
25%,1.0,0.0,1.0,1.0586960000000001e-39
50%,2.0,0.0,2.0,1.0586960000000001e-39
75%,6.0,0.0,6.0,1.0586960000000001e-39
max,472.0,0.0,472.0,1.0586960000000001e-39


*Graph Structure*

In [39]:
# Density
nx.density(G_wiki)

0.001506282559050557

In [40]:
# Communities

mod_com = greedy_modularity_communities(G_wiki)
print('Number of Communities based on Greedy Modularity: ', len(mod_com))

Number of Communities based on Greedy Modularity:  25


*Reciprocity*

In [41]:
recip = nx.reciprocity(G_wiki)
print('Reciprocity in the Overall Network: ', round(recip, 4))

Reciprocity in the Overall Network:  0.0687


In [42]:
wiki_nodes = list(G_wiki.nodes)

n_recip = []
for i in wiki_nodes:
    r = nx.reciprocity(G_wiki, nodes=i)
    n_recip.append(r)

In [43]:
print('The Number of Nodes is ', len(wiki_nodes))
print('Number of Nominations was ', len(nominees_list))
print('The Number of Nodes with No Reciprocal Relationships are ', n_recip.count(0.0))
print('The Number of Nodes with With Reciprocal Relationships are ', len(wiki_nodes) - n_recip.count(0.0))

The Number of Nodes is  11056
Number of Nominations was  3445
The Number of Nodes with No Reciprocal Relationships are  9364
The Number of Nodes with With Reciprocal Relationships are  1692


In [44]:
recip_df = pd.DataFrame()
recip_df['WIKIPEDIAN'] = wiki_nodes
recip_df['RECIPROCITY'] = n_recip

In [45]:
# Flag Nominees, Winners and Losers

recip_df = recip_df.assign(**dict.fromkeys(['NOMINEE'], 0))
for i in nominees_list:
    recip_df.loc[recip_df.WIKIPEDIAN == i, 'NOMINEE'] = 1
    
recip_df = recip_df.assign(**dict.fromkeys(['WIN'], 0))
winners = wiki[wiki['RESULT'] == 1]
winners = winners['TARGET'].tolist()
for i in winners:
    recip_df.loc[recip_df.WIKIPEDIAN == i, 'WIN'] = 1
    
recip_df = recip_df.assign(**dict.fromkeys(['LOSE'], 0))
losers = wiki[wiki['RESULT'] == -1]
losers = losers['TARGET'].tolist()
for i in losers:
    recip_df.loc[recip_df.WIKIPEDIAN == i, 'LOSE'] = 1

In [46]:
# Analyse those with reciprocal relationships

recip_df = recip_df[recip_df['RECIPROCITY'] != 0.0]
recip_df.describe()

Unnamed: 0,RECIPROCITY,NOMINEE,WIN,LOSE
count,1692.0,1692.0,1692.0,1692.0
mean,0.096851,1.0,0.751182,0.368794
std,0.064044,0.0,0.432456,0.482621
min,0.00211,1.0,0.0,0.0
25%,0.047619,1.0,1.0,0.0
50%,0.083436,1.0,1.0,0.0
75%,0.130329,1.0,1.0,1.0
max,0.5,1.0,1.0,1.0


In [47]:
loss = recip_df[recip_df['LOSE'] == 1]
print('Number of Election Loser with Reciprocity: ', loss.shape[0])

loss2win = loss[loss['WIN'] == loss['LOSE']]
print('Number of Losers with Reciprocity turned Winners: ', loss2win.shape[0])

Number of Election Loser with Reciprocity:  624
Number of Losers with Reciprocity turned Winners:  203
