# dataset preparation

In [1]:
# -*- coding: utf-8 -*-

import json
import codecs
from bs4 import BeautifulSoup

import pandas as pd

# Load the page records that drive the rest of the notebook. The file is
# decoded with "utf-8-sig" so a leading byte-order mark, if any, is dropped.
# A context manager is used so the file handle is closed right after parsing
# instead of being left open for the lifetime of the kernel.
with codecs.open("data/wikipedia-geometry/specialization.json", "r", "utf-8-sig") as specialization_file:
    specialization = json.load(specialization_file)

## basic processing
### word analysis

In [2]:
def basic_word_analysis(text):
    """Compute simple size statistics for a piece of text.

    Words are approximated by splitting ``text`` on single space characters,
    so consecutive spaces produce empty "words" and other whitespace (tabs,
    newlines) does not separate words.

    Parameters
    ----------
    text : str
        The text to analyse.

    Returns
    -------
    tuple
        ``(length, words, average_word_length)`` where ``length`` is the
        number of characters in ``text``, ``words`` is the number of
        space-separated chunks (always at least 1, even for an empty string)
        and ``average_word_length`` is ``(length - words) / words``.
    """
    length = len(text)
    # Use the argument itself: the previous version read a global named
    # ``content``, which only worked when the caller happened to define it.
    words = len(text.split(" "))
    # One character per word is subtracted as a stand-in for the separators.
    average_word_length = float(length - words) / float(words)

    return length, words, average_word_length

This function performs a basic word analysis using plain Python built-ins. Word counting would be more precise with an NLP framework such as `nltk`. This will be refined in later phases of the project and documented in a dedicated notebook.

### users

In [3]:
def basic_user_stats(page):
    """Summarise the contributors found in a page's revision history.

    Reads ``data/wikipedia-geometry/revisions/<page>.json``, which must contain
    a JSON list of revision objects. A revision with a ``"user"`` key counts
    towards registered users when its ``"userid"`` is non-zero and towards IP
    users otherwise; a revision without a ``"user"`` key is counted as hidden.

    Parameters
    ----------
    page : str
        Page name used to build the revisions file name.

    Returns
    -------
    tuple
        ``(unique_users, unique_registered_users, unique_ip_users,
        hidden_users, first_revision)``. ``hidden_users`` counts revisions,
        not distinct users. ``first_revision`` is the ``"timestamp"`` of the
        last entry in the file, or ``None`` when the file holds no revisions.

    Raises
    ------
    IOError
        If the revisions file cannot be opened.
    KeyError
        If a revision has ``"user"`` but no ``"userid"``, or the last
        revision has no ``"timestamp"``.
    """
    path = "data/wikipedia-geometry/revisions/%s.json" % (page)
    # Context manager so the handle is closed even if the JSON is malformed.
    with codecs.open(path, "r", "utf-8-sig") as revisions_file:
        revs = json.load(revisions_file)

    hidden_users = 0
    registered_users = set()
    ip_users = set()

    for r in revs:
        if "user" in r:
            if r["userid"] != 0:
                registered_users.add(r["user"])
            else:
                ip_users.add(r["user"])
        else:
            hidden_users += 1

    # NOTE(review): assumes the file lists revisions newest-first, so the last
    # entry is the page's creation - confirm against the download script.
    # An empty history yields None instead of an IndexError.
    first_revision = revs[-1]["timestamp"] if revs else None

    unique_registered_users = len(registered_users)
    unique_ip_users = len(ip_users)
    unique_users = unique_registered_users + unique_ip_users

    return unique_users, unique_registered_users, unique_ip_users, hidden_users, first_revision

### page views

In [4]:
def pageviews(page):
    """Return the total number of recorded views for ``page``.

    Reads ``data/wikipedia-geometry/pageviews/<page>.json``, which must contain
    a JSON list of objects whose values are numeric view counts (presumably
    one object per month keyed by day - verify against the download script).

    Parameters
    ----------
    page : str
        Page name used to build the page views file name.

    Returns
    -------
    int
        Sum of every value of every object in the file; 0 for an empty list.

    Raises
    ------
    IOError
        If the page views file cannot be opened.
    """
    path = "data/wikipedia-geometry/pageviews/%s.json" % (page)
    # Context manager so the handle is closed even if the JSON is malformed.
    with codecs.open(path, "r", "utf-8-sig") as views_file:
        view_records = json.load(views_file)

    # dict.values() exists on both Python 2 and 3, unlike dict.iteritems(),
    # and the keys were never used for the total anyway.
    return sum(count for record in view_records for count in record.values())

## finalization

In [5]:
# Enrich every page record in place with text, contributor and traffic
# statistics. For a quick partial run, iterate over specialization[0:25].
for p in specialization:
    page_path = "data/wikipedia-geometry/pages/%s.json" % (p["pagename"])
    # Context manager so each page file is closed as soon as it is parsed.
    with codecs.open(page_path, "r", "utf-8-sig") as content_file:
        j = json.load(content_file)

    # Take the first entry of the "pages" mapping (presumably the only one,
    # keyed by page id). Indexing dict.keys() is avoided because it is not
    # subscriptable on Python 3.
    r = next(iter(j["query"]["pages"].values()))

    # Records without any revision are left untouched, so their statistics
    # columns will be missing in the final table.
    if "revisions" in r:
        # Strip markup from the latest revision and keep only the text.
        # NOTE(review): no parser is passed, so bs4 picks whichever one is
        # installed; consider pinning one to make results reproducible.
        content = BeautifulSoup(r["revisions"][0]["*"]).text

        p["length"], p["words"], p["average word length"] = basic_word_analysis(content)
        p["unique users"], p["unique registered users"], p["unique ip users"], p["hidden users"], p["first revision"] = basic_user_stats(p["pagename"])
        p["page views"] = pageviews(p["pagename"])
        

We just make a small cosmetic change to the column names by replacing `.` with a space. This allows a more natural reading and better formatting of tables.

In [6]:
# Assemble the per-page table in one pass: one row per record of
# `specialization`, dots in column names swapped for spaces, and rows
# indexed by page name.
table = (
    pd.DataFrame(specialization)
    .rename(columns=lambda column_name: column_name.replace(".", " "))
    .set_index("pagename")
)

## preview

In [7]:
# Display the first ten rows of the assembled table as a quick visual check.
table.head(10)

Unnamed: 0_level_0,average word length,first revision,hidden users,length,number of backlinks,number of backlinks from ns 0,number of categories,number of links,number of revisions,number of revisions by IP,number of revisions by alive registered user,page views,quality of the article,specialization,unique ip users,unique registered users,unique users,words
pagename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2D computer graphics,5.912321,2001-10-13T06:23:27Z,0,24124,0,0,7,403,370,124,246,927245,4,0.1909,82,139,221,3490
3D computer graphics,6.360368,2007-03-21T05:56:20Z,0,8803,0,0,8,195,893,368,525,3169981,4,0.0627,258,240,498,1196
3-sphere,5.071429,2002-02-19T11:12:25Z,0,17595,0,0,7,116,277,59,218,486968,4,0.3721,44,117,161,2898
Affine geometry,5.424357,2003-06-11T09:28:43Z,0,15245,0,0,1,97,205,40,165,277337,3,0.3904,30,82,112,2373
Algebraic geometry,5.734462,2002-02-25T15:51:15Z,0,46919,1375,1009,1,379,626,189,437,810499,4,0.1887,153,213,366,6967
Altitude (triangle),5.543959,2002-12-24T06:11:40Z,0,12728,0,0,2,59,356,158,198,718973,3,0.0635,117,128,245,1945
Analytic geometry,5.54427,2002-02-25T15:51:15Z,0,29122,0,0,1,193,477,191,286,1006563,4,0.0972,150,176,326,4450
Angle,5.132834,2002-02-25T15:43:11Z,0,32780,0,0,11,192,1742,704,1038,3979866,4,0.0337,484,468,952,5345
Angle trisection,5.515563,2002-09-25T07:26:51Z,0,14862,0,0,7,163,434,110,324,261975,3,0.2845,67,127,194,2281
Aperiodic tiling,6.034483,2004-07-30T22:12:36Z,0,18972,0,0,2,227,277,62,215,175423,4,0.3557,41,104,145,2697


## storage

We store our intermediate data as `csv` instead of `json` for readability. For example, GitHub includes a table viewer for CSV files directly in its web interface. It is also more convenient to share and open CSV files within the data processing pipeline, since we mainly perform tabular computations rather than object manipulation.

In [8]:
# Write the table, including its index, to disk as UTF-8 encoded CSV.
table.to_csv("data/wikipedia-geometry/final.csv", encoding="UTF-8")