# Outputs cleanup #

In [1]:
import pandas as pd
import re
from unidecode import unidecode
import html
from HTMLParser import HTMLParser

In [2]:
outputs = pd.read_csv("outputs.csv")
authors = pd.read_csv("authors.csv")

In [3]:
outputs.index = outputs["PUBLICATION_ID"].values

In [4]:
auths = []
for i in outputs["PUBLICATION_ID"]:
    auths.append(
        authors[authors["PUBLICATION_ID"]==i]["PERSON_ID"].values
                )

In [5]:
outputs["AUTHORS"] = auths

In [6]:
del outputs["PUBLICATION_ID"]

In [7]:
# We dont need both TYPE_NO and TYPE
del outputs["TYPE_NO"]

In [8]:
print "Number of publications: %15d" % len(outputs["PUBLICATION_MONTH"])
print "Number of NaNs: %23d" % outputs["PUBLICATION_MONTH"].count()
print "Number of available months data: %6d" % (len(outputs["PUBLICATION_MONTH"]) - outputs["PUBLICATION_MONTH"].count())

Number of publications:           34123
Number of NaNs:                   19502
Number of available months data:  14621


In [9]:
print "Number of publications: %15d" % len(outputs["PUBLICATION_YEAR"])
print "Number of NaNs: %23d" % outputs["PUBLICATION_YEAR"].count()
print "Number of available year data: %4d" % (len(outputs["PUBLICATION_YEAR"]) - outputs["PUBLICATION_YEAR"].count())

Number of publications:           34123
Number of NaNs:                   34123
Number of available year data:    0


In [10]:
del outputs["PUBLICATION_DAY"]
del outputs["PUBLICATION_MONTH"]

In [11]:
x = list(set(outputs["TYPE"]))
contrib_mapping = {}
for i in x:
    ii = i.split("/")
    if ii[0].endswith("journal"):
        contrib_mapping[i] = ("journal", ii[1])
    elif "conference" in ii[0]:
        contrib_mapping[i] = ("conference", ii[1])
    elif "periodical" in ii[0]:
        contrib_mapping[i] = ("periodical", ii[1])
    elif "book" in ii[0]:
        contrib_mapping[i] = ("book", ii[1])
    elif "paper" in ii[0]:
        contrib_mapping[i] = ("paper", ii[1])
    else:
        contrib_mapping[i] = ("other", ii[1])

outputs["TYPE"] = outputs["TYPE"].map(contrib_mapping)

In [12]:
# split between publications with and without authors
outputs_named = outputs[outputs["AUTHORS"].apply(lambda x: len(x)) > 0]
outputs_nameless = outputs[outputs["AUTHORS"].apply(lambda x: len(x)) == 0]

In [13]:
print "Nameless papers:", len(outputs_nameless)
print "Named papers:", len(outputs_named)

Nameless papers: 4460
Named papers: 29663


In [14]:
# missing keywords
print "Missing KEYWORDS: %f%%" % (100.0*outputs_named["KEYWORDS"].count()/len(outputs_named["KEYWORDS"]))
print "Missing ABSTRACT: %f%%" % (100.0*outputs_named["ABSTRACT"].count()/len(outputs_named["ABSTRACT"]))

Missing KEYWORDS: 20.689074%
Missing ABSTRACT: 50.308465%


# Text cleanup #
* It's difficult to maintain latex notation / chemical physical abbrevs etc. -- worth investigating

In [15]:
# curate the data
### Handle &amp; etc. -- all of the HTML characters
def amp(s):
    return HTMLParser().unescape(s)
outputs_named["TITLE"] = outputs_named["TITLE"].apply(amp)

## Transofrm unisode into the nearest ASCII representation
def ud(s):
    return unidecode(re.sub(r'\\(....)', r'\u\g<1>', s).decode('unicode_escape'))
outputs_named["TITLE"] = outputs_named["TITLE"].apply(ud)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [16]:
outputs_named[["AUTHORS", "PUBLICATION_YEAR"]].to_pickle("pub_auth.pickle")

In [17]:
## Remove HTML tags
###############################################
### http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

outputs_named["TITLE"] = outputs_named["TITLE"].apply(strip_tags)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [24]:
def to_space(s):
    white_char = [".", ",", ":", "?", "!", "'", "`", "\"", \
                  "~", "\t", "\n", "#", "%", "&", "(", ")", \
                  "+", "*", ";", "=", "[", "]", "|", "{", "}", \
                  "<", ">", "\\", "/"]
    white_char.append("-")
    for i in white_char:
        s = s.replace(i, " ")
    return s
outputs_named["TITLE"] = outputs_named["TITLE"].apply(to_space)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [19]:
def to_special(s):
    s = s.replace("_", "")
    s = s.replace("^", "")
    s = s.replace("@", "a")
    # TODO: $ -> to space $$ -> remove everything in between
    s = s.replace("$", " ")
    return s
outputs_named["TITLE"] = outputs_named["TITLE"].apply(to_special)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [34]:
outputs_named["TITLE"] = outputs_named["TITLE"].apply(str.lower)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [35]:
outputs_named["TITLE"].head()

89664436    crossing the black atlantic for africa or how ...
55265799    central venous catheters and outcomes in hemod...
53490655                     communication as information use
56453104                                              tobacco
68562732    citizens and the co creation of public service...
Name: TITLE, dtype: object

In [38]:
def process_stirng(s):
    if isinstance(s, str):
        s = amp(s)
        s = ud(s)
        s = strip_tags(s)
        s = to_space(s)
        s = to_special(s)
        s = str.lower(s)
        return s
    else:
        return s

In [39]:
outputs_named["KEYWORDS"] = outputs_named["KEYWORDS"].apply(process_stirng)
outputs_named["ABSTRACT"] = outputs_named["ABSTRACT"].apply(process_stirng)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [40]:
outputs_named

Unnamed: 0,TITLE,TYPE,PUBLICATION_YEAR,KEYWORDS,ABSTRACT,AUTHORS
89664436,crossing the black atlantic for africa or how ...,"(book, chapter)",2012,,,[67324]
55265799,central venous catheters and outcomes in hemod...,"(book, chapter)",2012,,a significant proportion of incident and preva...,[58056]
53490655,communication as information use,"(book, chapter)",2011,,introduction uncertainty is an unavoidable pro...,"[1968, 12503]"
56453104,tobacco,"(book, chapter)",2013,,,"[27487, 22878]"
68562732,citizens and the co creation of public service...,"(book, chapter)",2013,,,[64861]
92016694,catholicism religious pluralism and education...,"(book, chapter)",2013,,,[8227]
91245910,electro induced orientational ordering of anis...,"(book, chapter)",2013,anisotropic pigment nanoparticles colloids e...,the response of anisotropic pigment particle s...,"[2472, 442]"
94611326,thorns on my tongue,"(book, chapter)",2013,,,[6863]
86474151,modulators of monocyte and macrophage phenotyp...,"(book, chapter)",2010,cd14 cd16 cd14 cd16 dendritic cells gm ...,,[11532]
52102572,data utilization in flood inundation modelling,"(book, chapter)",2010,data requirements for flood inundation modelli...,,"[28147, 8925]"


In [41]:
outputs_named.to_pickle("outputs_cleaned.pickle")