In [1]:
import process_functions as pf

In [2]:
import pandas as pd
import numpy as np
import re
import datetime

# 1. Read Data

In [3]:
# Load the raw survey export; fields in this file are separated by semicolons.
input_data = pd.read_csv(
    'ODI-2019-csv.csv',
    sep=';',
)

# 2. Pre-Process Data

In [4]:
# Copy the answer columns that only need lower-casing (selected by position)
# into the working frame. Only real strings are lower-cased; any other cell
# (e.g. the NaN that read_csv produces for an empty field) is passed through
# unchanged, whereas `applymap(str.lower)` raises TypeError on such cells.
data_processed = input_data.iloc[:, [0, 2, 3, 4, 5, 6, 7, 14, 15]].apply(
    lambda column: column.map(
        lambda cell: cell.lower() if isinstance(cell, str) else cell
    )
)

## 2.1 Normalize Programmes

In [5]:
# Second column of the raw export, lower-cased and stripped of surrounding
# whitespace so the replacement table below can match on exact values.
data_programme = (
    input_data[input_data.columns[1]]
    .str.lower()
    .str.strip()
)

In [6]:
# For every entry of pf.replacement, substitute the values stored under that
# entry with the entry's key (presumably: spelling variants -> canonical
# programme name; confirm against process_functions).
for canonical in pf.replacement:
    variants = pf.replacement[canonical]
    data_programme = data_programme.replace(to_replace=variants, value=canonical)

In [7]:
# Append the normalized programme column to the working frame.
# Note: re-running this cell appends the column a second time.
data_processed = pd.concat(
    [data_processed, data_programme],
    axis="columns",
)

## 2.2 Normalize Birthdates

In [8]:
# Ninth column of the raw export, lower-cased and stripped before it is
# handed to pf.splitbirthday.
data_birthdays = (
    input_data[input_data.columns[8]]
    .str.lower()
    .str.strip()
)

In [9]:
# One pf.splitbirthday result per respondent, in the original row order.
dates = [pf.splitbirthday(raw_birthday) for raw_birthday in data_birthdays]

In [11]:
# Build a three-column frame from the split birthdays and convert it to numbers.
#
# Empty strings are turned into NaN with an exact-match replace. The previous
# `.replace(r'', None, regex=True)` was wrong on every pandas version:
#   * older releases treat `value=None` as "pad", forward-filling the previous
#     respondent's value into the gap;
#   * newer releases apply the empty regex, which matches every string, so all
#     string cells are blanked.
data_birthdays = (
    pd.DataFrame(dates, columns=['Birth Day', 'Birth Month', 'Birth Year'])
    .replace('', np.nan)
    # Column-wise conversion; still raises if a cell is not numeric.
    .apply(pd.to_numeric)
)

In [12]:
# Append the day / month / year columns to the working frame.
# Note: re-running this cell appends the columns a second time.
data_processed = pd.concat(
    [data_processed, data_birthdays],
    axis="columns",
)

## 2.3 Normalize Neighbors, DM Competition, Random Number, Stress Level

In [13]:
def _clean_numeric(raw_values, *extra_args, as_text=False):
    """Run pf.removenonnum over one raw column and blank out empty results.

    Parameters
    ----------
    raw_values : iterable
        Raw cells of a single survey column.
    *extra_args
        Extra positional arguments forwarded to pf.removenonnum.
    as_text : bool, default False
        Convert each cell with str() before cleaning.

    Returns
    -------
    list
        One cleaned value per input cell; falsy results become None.
    """
    cleaned = []
    for raw in raw_values:
        if as_text:
            raw = str(raw)
        value = pf.removenonnum(raw, *extra_args)
        # NOTE(review): any falsy result is treated as missing. If
        # pf.removenonnum can return a numeric 0, that answer is dropped too;
        # confirm against process_functions.
        cleaned.append(value if value else None)
    return cleaned


# Columns are selected by position with .iloc. The previous `row[9]` lookups
# relied on integer keys falling back to positions on a label-indexed row,
# which is deprecated in pandas 2.x.
data_neighbors = _clean_numeric(input_data.iloc[:, 9], True)
data_competition = _clean_numeric(input_data.iloc[:, 11])
data_random = _clean_numeric(input_data.iloc[:, 12])
data_stress = _clean_numeric(input_data.iloc[:, 16], as_text=True)

data_numbers = pd.DataFrame({
    'Neighbors': data_neighbors,
    'Competition': data_competition,
    'Random': data_random,
    'Stress': data_stress,
}).apply(pd.to_numeric)  # column-wise; raises on non-numeric leftovers

In [14]:
# Append the four cleaned numeric columns to the working frame.
# Note: re-running this cell appends the columns a second time.
data_processed = pd.concat(
    [data_processed, data_numbers],
    axis="columns",
)

## 2.4 Normalize Bedtime

In [23]:
# Fourteenth column of the raw export, lower-cased and stripped so that the
# "am" / "pm" checks in the next cell match regardless of the original casing.
data_bedtime = (
    input_data[input_data.columns[13]]
    .str.lower()
    .str.strip()
)

In [24]:
# Reduce every bedtime answer to the part before the first ":" of whatever
# pf.fixtime returns (presumably the hour; confirm against process_functions).
#
# The results are collected in a new list instead of being written back into
# the series while iterating over it. This also avoids `Series.iteritems()`
# (removed in pandas 2.0) and passing index labels to `.iloc`.
bedtime_hours = []
for time in data_bedtime:
    if not isinstance(time, str):
        # Missing answer (NaN): `"am" in time` would raise TypeError.
        bedtime_hours.append(None)
        continue

    if "am" in time:
        fixed = pf.fixtime(time, "am")
    elif "pm" in time:
        fixed = pf.fixtime(time, "pm")
    else:
        fixed = pf.fixtime(time)

    # A non-string result has nothing to split, so it is recorded as missing
    # (this replaces the former bare `except:`).
    bedtime_hours.append(fixed.split(":")[0] if isinstance(fixed, str) else None)

# Keep the original index and column name so later cells see the same series.
data_bedtime = pd.Series(
    bedtime_hours,
    index=data_bedtime.index,
    name=data_bedtime.name,
    dtype=object,
)

In [26]:
# Convert the extracted values to a numeric dtype; missing entries become NaN
# and anything unparsable raises (the default behaviour, spelled out here).
data_bedtime = pd.to_numeric(
    data_bedtime,
    errors='raise',
)

In [27]:
# Append the numeric bedtime column to the working frame.
# Note: re-running this cell appends the column a second time.
data_processed = pd.concat(
    [data_processed, data_bedtime],
    axis="columns",
)

# 3. Save Pre-Processed Data

In [28]:
# Display the dtype of every column as a check before the frame is written out.
data_processed.dtypes

Timestamp                                             object
Have you taken a course on machine learning?          object
Have you taken a course on information retrieval?     object
Have you taken a course on statistics?                object
Have you taken a course on databases?                 object
What is your gender?                                  object
Chocolate makes you.....                              object
What makes a good day for you (1)?                    object
What makes a good day for you (2)?                    object
What programme are you in?                            object
Birth Day                                            float64
Birth Month                                          float64
Birth Year                                           float64
Neighbors                                            float64
Competition                                          float64
Random                                               float64
Stress                  

In [29]:
# Write the pre-processed frame with a header row and without the row index.
data_processed.to_csv(
    'ODI-2019-processed.csv',
    header=True,
    index=False,
)