# dataset preparation

In [2]:
# -*- coding: utf-8 -*-

import json
import codecs
from bs4 import BeautifulSoup

import pandas as pd

# Read the page names, one per line; the "utf-8-sig" codec drops a leading BOM
# if the file has one. The handle is closed as soon as the lines are read.
with codecs.open("data/pagenames.txt", "r", "utf-8-sig") as pagenames_file:
    # Build a real list (not a lazy map object) so `pages` can be iterated
    # more than once, on Python 2 and Python 3 alike.
    pages = [line.strip() for line in pagenames_file.readlines()]

## basic processing
### word analysis

In [8]:
def basic_word_analysis(text):
    """Compute rough size statistics for a piece of text.

    Words are obtained by splitting on single space characters only, so the
    counts are approximate: consecutive spaces produce empty tokens and other
    whitespace (newlines, tabs) does not separate words.

    Args:
        text: the string to analyse.

    Returns:
        A tuple ``(length, words, average_word_length)`` where ``length`` is
        ``len(text)``, ``words`` is the number of space-separated tokens and
        ``average_word_length`` is ``(length - words) / words`` as a float.
    """
    length = len(text)
    # Count the tokens of the `text` argument itself, not of any notebook-level
    # variable, so the function works on whatever string it is given.
    words = len(text.split(" "))
    # split(" ") always yields at least one token (even for ""), so `words`
    # is never zero and the division below cannot fail.
    average_word_length = float(length - words) / float(words)

    return length, words, average_word_length

This function performs a basic word analysis using plain Python string functions. Word counting would be more precise with an NLP framework such as `nltk`. Further refinement will be done in later phases of the project and documented in a dedicated notebook.

### users

In [5]:
def basic_user_stats(page):
    """Count the distinct contributors in a page's stored revision history.

    Reads ``data/revisions/<page>.json``, which must contain a JSON list of
    revision objects.

    Args:
        page: page name used to build the revisions file path.

    Returns:
        A tuple ``(unique_users, unique_registered_users, unique_ip_users,
        hidden_users, first_revision)`` where:

        * ``unique_registered_users`` counts distinct ``"user"`` values whose
          ``"userid"`` is not 0;
        * ``unique_ip_users`` counts distinct ``"user"`` values whose
          ``"userid"`` is 0;
        * ``unique_users`` is the sum of the two counts above;
        * ``hidden_users`` counts revisions (not distinct users) that carry no
          ``"user"`` key;
        * ``first_revision`` is the ``"timestamp"`` of the last list entry, or
          ``None`` when the list is empty.

    Raises:
        IOError: if the revisions file cannot be opened.
        KeyError: if a revision has a ``"user"`` but no ``"userid"``.
    """
    # Close the file as soon as it is parsed instead of leaking the handle.
    with codecs.open("data/revisions/%s.json" % (page), "r", "utf-8-sig") as revisions_file:
        revs = json.load(revisions_file)

    registered_users = set()
    ip_users = set()
    hidden_users = 0

    for r in revs:
        if "user" in r:
            if r["userid"] != 0:
                registered_users.add(r["user"])
            else:
                ip_users.add(r["user"])
        else:
            hidden_users += 1

    # Assumes the revisions are stored newest first, so the last entry is the
    # oldest one -- TODO confirm against the data dump.
    # An empty history has no first revision; report None rather than crash.
    first_revision = revs[-1]["timestamp"] if revs else None

    unique_registered_users = len(registered_users)
    unique_ip_users = len(ip_users)
    unique_users = unique_registered_users + unique_ip_users

    return unique_users, unique_registered_users, unique_ip_users, hidden_users, first_revision

# Smoke test on one page; the parenthesised form prints the same text on
# Python 2 and is also valid Python 3.
print(basic_user_stats("Pi"))

(2880, 1799, 1081, 0, u'2002-02-01T12:26:39Z')


### page views

In [4]:
def pageviews(page):
    """Return the total number of views recorded for a page.

    Reads ``data/pageviews/<page>.json``, which must contain a JSON list of
    objects; every value of every object is added to the total. The keys are
    ignored (presumably dates -- verify against the data files).

    Args:
        page: page name used to build the page-views file path.

    Returns:
        The sum of all values found in the file (0 for an empty list).

    Raises:
        IOError: if the page-views file cannot be opened.
    """
    # Close the file as soon as it is parsed instead of leaking the handle.
    with codecs.open("data/pageviews/%s.json" % (page), "r", "utf-8-sig") as views_file:
        periods = json.load(views_file)

    # .values() exists on both Python 2 and Python 3 dicts (iteritems does not).
    return sum(sum(counts.values()) for counts in periods)

# Smoke test on one page; the parenthesised form prints the same text on
# Python 2 and is also valid Python 3.
print(pageviews("Pi"))

23309406


### pagerank

In [24]:
# Load the pagerank table with its first column as the index, which is what the
# index-based join in the finalization step relies on.
# read_csv replaces DataFrame.from_csv (deprecated, removed in pandas 1.0).
# from_csv also tried to parse the index as dates; that is not requested here
# because the index is expected to hold page names.
pageranks = pd.read_csv("data/pagerank.csv", sep=";", encoding="utf-8", index_col=0)

### specialization

In [37]:
# Specialization records, re-indexed by their "title" column so they can be
# joined on the index later.
specialization = pd.read_json("data/specialization.json").set_index("title")

### co-editors graph

In [38]:
# Load the co-editors statistics with the first CSV column as the index.
# read_csv replaces DataFrame.from_csv (deprecated, removed in pandas 1.0).
# from_csv also tried to parse the index as dates; that is not requested here
# because the index is expected to hold page names.
coeditors = pd.read_csv("data/pages-linked-by-coeditors.stats.csv", encoding="utf-8", index_col=0)

## finalization

In [25]:
# Build one record per page: the page name plus, when the stored page has
# revisions, its text statistics, contributor statistics and total page views.
data = []

for page in pages:
    p = {"pagename": page}

    # Close each content file as soon as it is parsed instead of leaking one
    # open handle per page.
    with codecs.open("data/pages/%s.json" % (page), "r", "utf-8-sig") as content_file:
        j = json.load(content_file)

    # Take the first entry of the "pages" mapping, whatever its key is.
    # next(iter(...)) works on Python 2 and 3, whereas .keys()[0] only works
    # on Python 2.
    r = next(iter(j["query"]["pages"].values()))

    # Pages without a "revisions" entry keep only their name in the record.
    if "revisions" in r:
        content = r["revisions"][0]["*"]
        # NOTE(review): no parser is named, so bs4 picks whichever one is
        # installed; consider pinning one for reproducible text extraction.
        content = BeautifulSoup(content).text

        p["length"], p["words"], p["average word length"] = basic_word_analysis(content)
        (p["unique users"],
         p["unique registered users"],
         p["unique ip users"],
         p["hidden users"],
         p["first revision"]) = basic_user_stats(page)
        p["page views"] = pageviews(page)

    data.append(p)

In [47]:
# Assemble the final table: per-page records indexed by page name, then
# enriched through index-aligned joins with the three auxiliary tables.
# Columns from `coeditors` that clash with existing ones get the
# " (co-editors)" suffix.
table = (
    pd.DataFrame(data)
    .set_index("pagename")
    .join(pageranks)
    .join(specialization)
    .join(coeditors, rsuffix=" (co-editors)")
)

## preview

In [48]:
# Display the first ten rows of the joined table as a quick visual check.
table.head(10)

Unnamed: 0_level_0,average word length,first revision,hidden users,length,page views,unique ip users,unique registered users,unique users,words,Pagerank pro 0.8,...,nbcontributorsBot,nbcontributorsIP,nbcontributorsMembers,nbrevisions,nbrevisionsBot,nbrevisionsIP,nbrevisionsMembers,ns,pageid,quality
pagename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2D computer graphics,5.912321,2001-10-13T06:23:27Z,0,24124,930542,83,139,222,3490,2.540488,...,18.0,78.0,119.0,370.0,23.0,124.0,223.0,0,35248,4.0
2D geometric model,5.786585,2004-03-08T00:49:59Z,0,1113,155960,13,28,41,164,1.846966,...,3.0,13.0,24.0,54.0,3.0,20.0,31.0,0,511647,1.0
3D computer graphics,6.360368,2007-03-21T05:56:20Z,0,8803,3174611,258,240,498,1196,3.665079,...,28.0,250.0,211.0,893.0,65.0,368.0,460.0,0,10175073,4.0
3D projection,5.907763,2003-09-07T18:48:57Z,0,8987,1141382,98,113,211,1301,3.430536,...,16.0,95.0,96.0,351.0,25.0,117.0,209.0,0,313741,2.0
3-sphere,5.071429,2002-02-19T11:12:25Z,0,17595,488222,44,117,161,2898,3.693437,...,12.0,44.0,103.0,277.0,13.0,59.0,205.0,0,39792,4.0
Absolute geometry,5.561167,2004-06-02T19:31:39Z,0,5846,77451,22,59,81,891,2.870685,...,14.0,20.0,44.0,104.0,15.0,27.0,62.0,0,699294,2.0
Acute and obtuse triangles,5.818387,2014-10-10T19:11:27Z,0,9048,5549,1,2,3,1327,0.829064,...,,,,,,,,0,44076423,
Affine geometry,5.424357,2003-06-11T09:28:43Z,0,15245,277910,30,82,112,2373,3.49503,...,15.0,30.0,64.0,205.0,22.0,40.0,143.0,0,243890,3.0
Affine space,5.325827,2003-08-18T04:32:19Z,0,13202,401482,64,105,169,2087,4.01055,...,18.0,61.0,84.0,384.0,30.0,91.0,263.0,0,298834,2.0
Affine transformation,5.599894,2002-02-25T15:51:15Z,0,12454,1333126,78,139,217,1887,4.072621,...,21.0,76.0,111.0,371.0,26.0,107.0,238.0,0,38449,2.0


## storage

We store our intermediary data as `csv` rather than `json` for readability. For example, GitHub includes a web table viewer directly in its main interface. CSV files are also more convenient to share and open within the data processing pipeline, since we are mainly doing tabular computation rather than object manipulation.

In [49]:
# Save the joined table as CSV (see the rationale above).
table.to_csv("data/final.csv", encoding="UTF-8")