Load Data From CSVs

In [43]:
import os

import unicodecsv

def read_csv(filename):
    """Read a CSV file and return all of its rows as a list.

    The file is opened in binary mode and handed to ``unicodecsv.DictReader``,
    so each element of the returned list is one row of the file as produced by
    that reader. The whole file is materialised in memory before returning.

    filename -- path of the CSV file to read.
    """
    with open(filename, 'rb') as csv_file:
        return list(unicodecsv.DictReader(csv_file))
    
# Directory holding the three CSV exports. Kept in a single constant so the
# notebook can be pointed at another location by editing one line instead of
# three separate path literals.
DATA_DIR = '/Users/pein/Desktop'

# Load each table as a list of row dicts (see read_csv).
enrollments = read_csv(os.path.join(DATA_DIR, 'enrollments.csv'))
daily_engagement = read_csv(os.path.join(DATA_DIR, 'daily_engagement.csv'))
project_submissions = read_csv(os.path.join(DATA_DIR, 'project_submissions.csv'))


Fixing Data Types

In [44]:
from datetime import datetime as dt
def parse_date(date):
    """Turn a 'YYYY-MM-DD' string into a Python datetime object.

    An empty string means no date was given, in which case None is returned.
    Any other value is handed to ``strptime`` with the format '%Y-%m-%d', which
    raises ValueError for text that does not match that format.
    """
    return None if date == '' else dt.strptime(date, '%Y-%m-%d')

def parse_maybe_int(i):
    """Convert a value to int, mapping the empty string to None.

    Anything other than '' is passed straight to ``int``, so text that is not a
    valid integer literal raises ValueError.
    """
    if i == '':
        return None
    return int(i)

# Convert the raw string fields of every enrollment row into typed values,
# updating each row in place.
for enrollment in enrollments:
    enrollment['cancel_date'] = parse_date(enrollment['cancel_date'])
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])
    # Flags are compared against the exact text 'True'; any other value
    # becomes False.
    for flag in ('is_canceled', 'is_udacity'):
        enrollment[flag] = (enrollment[flag] == 'True')
    enrollment['join_date'] = parse_date(enrollment['join_date'])

# Show the first engagement row as the cell output.
daily_engagement[0]

OrderedDict([('acct', '0'),
             ('utc_date', '2015-01-09'),
             ('num_courses_visited', '1.0'),
             ('total_minutes_visited', '11.6793745'),
             ('lessons_completed', '0.0'),
             ('projects_completed', '0.0')])

In [48]:
# Give every engagement row proper numeric and date types, in place.
for engagement_record in daily_engagement:
    # The counters are stored as float-formatted text (e.g. '1.0'), so they go
    # through float() before being truncated to int.
    for counter in ('lessons_completed', 'num_courses_visited', 'projects_completed'):
        engagement_record[counter] = int(float(engagement_record[counter]))
    engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])

# Show the first cleaned row as the cell output.
daily_engagement[0]

OrderedDict([('acct', '0'),
             ('utc_date', datetime.datetime(2015, 1, 9, 0, 0)),
             ('num_courses_visited', 1),
             ('total_minutes_visited', 11.6793745),
             ('lessons_completed', 0),
             ('projects_completed', 0)])

In [49]:
# Parse both date columns of every project submission, in place.
for submission in project_submissions:
    for date_field in ('completion_date', 'creation_date'):
        submission[date_field] = parse_date(submission[date_field])

# Show the first cleaned row as the cell output.
project_submissions[0]

OrderedDict([('creation_date', datetime.datetime(2015, 1, 14, 0, 0)),
             ('completion_date', datetime.datetime(2015, 1, 16, 0, 0)),
             ('assigned_rating', 'UNGRADED'),
             ('account_key', '256'),
             ('lesson_key', '3176718735'),
             ('processing_state', 'EVALUATED')])

### Investigating the Data

In [54]:
def get_unique_students(data):
    """Return the set of distinct 'account_key' values found in `data`.

    data -- an iterable of mappings, each of which must have an 'account_key'
            entry (a missing key raises KeyError).
    """
    return {row['account_key'] for row in data}
# A bare expression in the middle of a cell is evaluated and then discarded,
# so the counts are printed explicitly to make them visible.
print(len(enrollments))
unique_student_enrollments = get_unique_students(enrollments)
print(len(unique_student_enrollments))

# Engagement rows come out of the CSV with the student id stored under 'acct';
# the rename to 'account_key' lives in a cell placed after this one. Add the
# 'account_key' entry to any row that does not have it yet, so this cell does
# not raise KeyError when the notebook is run top to bottom on a fresh kernel.
# Rows that were already renamed are left untouched.
for engagement_record in daily_engagement:
    if 'account_key' not in engagement_record:
        engagement_record['account_key'] = engagement_record['acct']
unique_student_engagement = get_unique_students(daily_engagement)

In [53]:
# Rename the 'acct' column of the engagement table to 'account_key', the key
# name used by the enrollment rows.
for engagement_record in daily_engagement:
    # Rows that no longer have 'acct' were renamed by an earlier run of this
    # cell; skipping them keeps the cell safe to re-run instead of raising
    # KeyError.
    if 'acct' in engagement_record:
        engagement_record['account_key'] = engagement_record.pop('acct')


In [57]:
# Print the first enrollment whose student never appears in the engagement
# table, then stop looking.
for enrollment in enrollments:
    student = enrollment['account_key']
    if student in unique_student_engagement:
        continue
    print(enrollment)
    break

OrderedDict([('account_key', '1219'), ('status', 'canceled'), ('join_date', datetime.datetime(2014, 11, 12, 0, 0)), ('cancel_date', datetime.datetime(2014, 11, 12, 0, 0)), ('days_to_cancel', 0), ('is_udacity', False), ('is_canceled', True)])
