In [1]:
%pylab --no-import-all
from os import path
import pandas as pd

Using matplotlib backend: MacOSX
Populating the interactive namespace from numpy and matplotlib


In [2]:
try:
    file = path.join("..", "data", "raw", "london.csv")
except OSError:
    print("This repository does not host the data. "
          "Put the csv in ../data/raw/")
    raise
df = pd.read_csv(file, na_values=0)

Rename Italian columns to English.

In [3]:
df.rename(columns={'DONNA': 'ID',
                   'P_SPEZZ': 'SEGMENT_ID',
                   'P_CICLO': 'CYCLE_ID',
                   'ANNO_NAS': 'BIRTH_YR',
                   'DATA': 'BEGIN_DATE',
                   'T_SPEZZ': 'N_SEGMENTS',
                   'T_CICLI': 'N_CYCLES',
                   'QUALIFI': 'DESC',
                   'TIPOTEMP': 'TEMP_SCALE',
                   'L_CICLO': 'L_CYCLE',
                   'L_PREOV': 'L_PREOVULATION',
                   'L_PERIOD': 'L_PERIOD',
                   'FIGLI': 'CHILDREN'
                  },
         inplace=True)

We delete the cycles (rows) where the `L_PREOV` is missing (NA), because this data is unusable - we don't have the variable we seek to predict. **Ex:** If row 100 is deleted, the index would be $..., 98, 99, 101, 102, ...$

After these rows are dropped, there are holes created in the index of the data frame. We would like the index numbers to be consecutive, so we use the `reset_index` function.

In [4]:
df.dropna(subset=['L_PREOVULATION'], inplace=True)

Delete those where DESC != 1 because those are flawed entries.

In [5]:
df = df[df.DESC == 1]
df.drop('DESC', 1, inplace=True)

Convert all measurements to Fahrenheit.

In [6]:
FAHRENHEIT = 1
CELSIUS = 2
df = df[df.TEMP_SCALE==FAHRENHEIT]
#for i in range(1, 100):
#    df.ix[df.TEMP_SCALE != 1, 'TEMP' + str(i)] = 32 + 9 / 5 * df.ix[df.TEMP_SCALE != 1, 'TEMP' + str(i)]
#df.drop(['TEMP_SCALE'], axis='columns', inplace=True)

Create an age measurement for each cycle, computed from the birth year and measurement date.

In [7]:
df['AGE'] = (df.BEGIN_DATE.apply(lambda s: int(s.split('/')[-1]) % 100 if isinstance(s, str) else s) - df.BIRTH_YR)

In [8]:
import errno
import os

destination = ["..", "data", "interim"]
df.to_csv(path.join(*destination, "df.csv"))

In [9]:
df.AGE

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
5         NaN
6         NaN
7         NaN
8         NaN
9         NaN
10        NaN
11        NaN
12        NaN
13        NaN
14        NaN
15        NaN
16        NaN
18        NaN
19        NaN
20        NaN
21        NaN
22        NaN
23        NaN
24        NaN
25        NaN
27        NaN
28        NaN
29        NaN
31        NaN
32        NaN
         ... 
34432    42.0
34433    42.0
34434    42.0
34435    42.0
34436    42.0
34438    42.0
34439    42.0
34440    42.0
34441    43.0
34442    43.0
34443    43.0
34444    43.0
34445    43.0
34446    43.0
34447    43.0
34448    43.0
34575    32.0
34589    33.0
34590    33.0
34591    33.0
34592    34.0
34599    34.0
34600    34.0
34601    34.0
34602    34.0
35850     NaN
35949     NaN
35950     NaN
35951     NaN
35952     NaN
Name: AGE, dtype: float64