在开始之前，使用 iPython 笔记本时有几点需要注意一下：

- 代码单元格运行之后，单元格左侧的方框中会出现数字。
- 当你启动一个新的笔记本会话时，请确保运行所有单元格（从第1个到你上次离开的单元格）。即使笔记本中依然可以看到上次运行所得到的输出，重新启动的笔记本内核却处于一个全新的状态，所以你需要重新载入数据并运行代码。
- 上一条注意事项特别重要。当你的答案和课程中的练习不一致时，请试着重载数据并一个一个运行代码单元格，以确保你所操作的变量和数据和练习中的一致。


## 从 CSV 加载数据

In [1]:
import unicodecsv

## Long version (functionally identical to the short version below)

# enrollments = []
# f = open('enrollments.csv', 'rb')
# reader = unicodecsv.DictReader(f)
# for row in reader:
#     enrollments.append(row)
# f.close()

# Short version: read every row of enrollments.csv into a list, one item per
# row as produced by unicodecsv.DictReader. The file is opened in binary mode
# and is closed automatically when the `with` block ends.
with open('enrollments.csv', 'rb') as f:
    reader = unicodecsv.DictReader(f)
    enrollments = list(reader)

In [2]:
#####################################
#                 1                 #
#####################################

## 从 daily_engagement.csv 和 project_submissions.csv 载入数据并存
## 储至下面的变量中，然后检查每张表的第1行。
def read_csv(filename):
    """Read a CSV file and return all of its rows as a list.

    filename: path of the CSV file; it is opened in binary mode ('rb').
    Returns the rows yielded by ``unicodecsv.DictReader``, collected
    into a list before the file is closed.
    """
    with open(filename, 'rb') as csv_file:
        rows = list(unicodecsv.DictReader(csv_file))
    return rows
# Locations of the two remaining CSV files.
# NOTE(review): these are machine-specific absolute paths, whereas
# enrollments.csv is opened through a relative path; adjust before running
# on another machine.
engagement_filename = 'E:/Udacity_DA/DAND-Basic-Materials-master/P2/2.1_Files/daily-engagement.csv'
submissions_filename = 'E:/Udacity_DA/DAND-Basic-Materials-master/P2/2.1_Files/project-submissions.csv'
    
daily_engagement = read_csv(engagement_filename)     # all rows of the daily engagement table
project_submissions = read_csv(submissions_filename)  # all rows of the project submissions table

## 修正数据类型

In [3]:
from datetime import datetime as dt

# 将字符串格式的时间转为 Python datetime 类型的时间。
# 如果没有时间字符串传入，返回 None

def parse_date(date):
    """Convert a 'YYYY-MM-DD' string into a datetime.

    Returns None when *date* is the empty string; otherwise returns the
    result of ``dt.strptime(date, '%Y-%m-%d')``.
    """
    return None if date == '' else dt.strptime(date, '%Y-%m-%d')
    
# 将可能是空字符串或字符串类型的数据转为 整型 或 None。

def parse_maybe_int(i):
    """Convert a possibly empty string into an int.

    Returns None for the empty string and ``int(i)`` for anything else.
    """
    if i != '':
        return int(i)
    return None

# Clean up the data types in the enrollments table.
# Every record is converted in place, so the values are overwritten.

for enrollment in enrollments:
    enrollment['cancel_date'] = parse_date(enrollment['cancel_date'])
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])
    # Comparing against the text 'True' turns the field into a bool.
    enrollment['is_canceled'] = enrollment['is_canceled'] == 'True'
    enrollment['is_udacity'] = enrollment['is_udacity'] == 'True'
    enrollment['join_date'] = parse_date(enrollment['join_date'])
    
# Show the first cleaned record.
enrollments[0]

{u'account_key': u'448',
 u'cancel_date': datetime.datetime(2015, 1, 14, 0, 0),
 u'days_to_cancel': 65,
 u'is_canceled': True,
 u'is_udacity': True,
 u'join_date': datetime.datetime(2014, 11, 10, 0, 0),
 u'status': u'canceled'}

In [4]:
# Clean up the data types in the engagement table (records are converted in place).
for engagement_record in daily_engagement:
    # The count fields go through float() before int(); presumably the CSV
    # stores them with a decimal point (e.g. '0.0'). TODO confirm against the file.
    engagement_record['lessons_completed'] = int(float(engagement_record['lessons_completed']))
    engagement_record['num_courses_visited'] = int(float(engagement_record['num_courses_visited']))
    engagement_record['projects_completed'] = int(float(engagement_record['projects_completed']))
    engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])
    
# Show the first cleaned record.
daily_engagement[0]

{u'acct': u'0',
 u'lessons_completed': 0,
 u'num_courses_visited': 1,
 u'projects_completed': 0,
 u'total_minutes_visited': 11.6793745,
 u'utc_date': datetime.datetime(2015, 1, 9, 0, 0)}

In [5]:
# Clean up the data types in the submissions table (records are converted in place).
for submission in project_submissions:
    # Both date fields become datetime objects, or None when the field is empty.
    submission['completion_date'] = parse_date(submission['completion_date'])
    submission['creation_date'] = parse_date(submission['creation_date'])

# Show the first cleaned record.
project_submissions[0]

{u'account_key': u'256',
 u'assigned_rating': u'UNGRADED',
 u'completion_date': datetime.datetime(2015, 1, 16, 0, 0),
 u'creation_date': datetime.datetime(2015, 1, 14, 0, 0),
 u'lesson_key': u'3176718735',
 u'processing_state': u'EVALUATED'}

注意：运行上方的单元格时，我们已经改变了数据变量中的内容。如果你在同一个会话中多次运行这些单元格，可能会出错。

## 探索数据

In [6]:
#####################################
#                 2                 #
#####################################

## 计算每张表中的总行数，和独立学生（拥有独立的 account keys）的数量




def get_num_unique_students(element_csv):
    """Count the distinct ``account_key`` values in a table.

    element_csv: iterable of dict-like rows, each holding an
        'account_key' entry.
    Returns the number of different account keys (0 for an empty table).
    Raises KeyError if a row has no 'account_key' entry.
    """
    # A set de-duplicates in a single pass; testing membership in a list
    # would rescan the list for every row.
    return len({x[u'account_key'] for x in element_csv})
def get_num_daily(element_csv):
    """Count the distinct ``acct`` values in a table.

    element_csv: iterable of dict-like rows, each holding an 'acct' entry.
    Returns the number of different 'acct' values (0 for an empty table).
    Raises KeyError if a row has no 'acct' entry.
    """
    # A set de-duplicates in a single pass; testing membership in a list
    # would rescan the list for every row.
    return len({x[u'acct'] for x in element_csv})
    
enrollment_num_rows = len(enrollments)  # total rows in the enrollments table
enrollment_num_unique_students = get_num_unique_students(enrollments)  # distinct 'account_key' values
# print enrollment_num_rows
# print enrollment_num_unique_students

engagement_num_rows = len(daily_engagement)             # total rows in the engagement table
engagement_num_unique_students = get_num_daily(daily_engagement)  # distinct 'acct' values (this table uses 'acct' as its key column)

submission_num_rows = len(project_submissions)  # total rows in the submissions table
submission_num_unique_students = get_num_unique_students(project_submissions)  # distinct 'account_key' values
# print submission_num_unique_students
# a=set()
# a.add(1)
# a.add(2)
# a.add(1)
# print a

## 数据中的问题

In [7]:
#####################################
#                 3                 #
#####################################
# Move each record's value from the 'acct' key to 'account_key', then print
# the first record's account key as a quick check.
for engagement_record in daily_engagement:
    engagement_record[u'account_key'] = engagement_record.pop(u'acct')
print(daily_engagement[0][u'account_key'])

## 将 daily_engagement 表中的 "acct" 重命名为 "account_key"


0


In [8]:
def get_uniqe_student(data):
    """Return the set of 'account_key' values found in *data*.

    data: iterable of dict-like rows, each holding an 'account_key' entry.
    Returns a set (empty when *data* is empty).
    """
    return {row['account_key'] for row in data}

## 缺失的互动（Engagement）记录

In [9]:
#####################################
#                 4                 #
#####################################

## Find any student who appears in enrollments but not in the daily
## engagement table, and print that enrollment record.
u_enrollments = get_uniqe_student(enrollments)
u_daily = get_uniqe_student(daily_engagement)
for enrollment in enrollments:
    if enrollment[u'account_key'] in u_daily:
        continue
    # First enrollment whose account has no engagement record: show it and stop.
    print(enrollment)
    break


{u'status': u'canceled', u'is_udacity': False, u'is_canceled': True, u'join_date': datetime.datetime(2014, 11, 12, 0, 0), u'account_key': u'1219', u'cancel_date': datetime.datetime(2014, 11, 12, 0, 0), u'days_to_cancel': 0}


## 检查更多的问题记录

In [10]:
#####################################
#                 5                 #
#####################################
## 计算与众不同的数据点条数（在 enrollments 中存在，但在 engagement 表中缺失）
# e_en=[]
# for m in enrollments:
#     if m[u'days_to_cancel']!=0:
#         e_en.append(m[u'account_key'])
# n=e_en-u_daily
# print n
# print len(n)
# Count (and print) the enrollments whose account has no engagement record
# even though the join date and the cancel date differ.
num_problem_students = 0
for enrollment in enrollments:
    has_engagement = enrollment[u'account_key'] in u_daily
    canceled_on_join_day = enrollment[u'join_date'] == enrollment[u'cancel_date']
    if has_engagement or canceled_on_join_day:
        continue
    print(enrollment)
    num_problem_students += 1

num_problem_students

{u'status': u'canceled', u'is_udacity': True, u'is_canceled': True, u'join_date': datetime.datetime(2015, 1, 10, 0, 0), u'account_key': u'1304', u'cancel_date': datetime.datetime(2015, 3, 10, 0, 0), u'days_to_cancel': 59}
{u'status': u'canceled', u'is_udacity': True, u'is_canceled': True, u'join_date': datetime.datetime(2015, 3, 10, 0, 0), u'account_key': u'1304', u'cancel_date': datetime.datetime(2015, 6, 17, 0, 0), u'days_to_cancel': 99}
{u'status': u'current', u'is_udacity': True, u'is_canceled': False, u'join_date': datetime.datetime(2015, 2, 25, 0, 0), u'account_key': u'1101', u'cancel_date': None, u'days_to_cancel': None}


3

## 追踪剩余的问题

In [11]:
# Collect the account keys of all Udacity test accounts into a set
udacity_test_accounts = {
    record['account_key'] for record in enrollments if record['is_udacity']
}
len(udacity_test_accounts)

6

In [12]:
# 通过 account_key 删除所有 Udacity 的测试帐号
def remove_udacity_accounts(data):
    """Return the rows of *data* that do not belong to a Udacity test account.

    data: iterable of dict-like rows, each holding an 'account_key' entry.
    Returns a new list, in the original order, of the rows whose
    'account_key' is not in the module-level ``udacity_test_accounts``.
    """
    return [
        row for row in data
        if row['account_key'] not in udacity_test_accounts
    ]

In [13]:
# Strip the Udacity test accounts out of all three tables
non_udacity_enrollments = remove_udacity_accounts(enrollments)
non_udacity_engagement = remove_udacity_accounts(daily_engagement)
non_udacity_submissions = remove_udacity_accounts(project_submissions)

# Report how many rows remain in each table, one count per line.
for filtered_table in (non_udacity_enrollments,
                       non_udacity_engagement,
                       non_udacity_submissions):
    print(len(filtered_table))

1622
135656
3634


## 重新定义问题

In [20]:
#####################################
#                 6                 #
#####################################

## Build a dict called paid_students holding every student who has either not
## canceled or who canceled more than 7 days after enrolling.
## Key: account key. Value: the student's join date; when the same account has
## several qualifying enrollments, the most recent join date is kept.
paid_students = {}
for data_point in non_udacity_enrollments:
    days_to_cancel = data_point['days_to_cancel']
    # days_to_cancel may be None (parse_maybe_int returns None for an empty
    # field); treat that as "not more than 7 days" instead of comparing None.
    stayed_over_a_week = days_to_cancel is not None and days_to_cancel > 7
    if not data_point['is_canceled'] or stayed_over_a_week:
        account_key = data_point['account_key']
        join_date = data_point['join_date']
        # An account can appear in more than one enrollment row; keep the
        # latest join date rather than whichever row happens to come last.
        if (account_key not in paid_students
                or join_date > paid_students[account_key]):
            paid_students[account_key] = join_date

print(len(paid_students))
print(paid_students)

995
{u'1200': datetime.datetime(2015, 3, 4, 0, 0), u'1175': datetime.datetime(2015, 4, 2, 0, 0), u'1269': datetime.datetime(2015, 8, 21, 0, 0), u'669': datetime.datetime(2015, 5, 12, 0, 0), u'1257': datetime.datetime(2015, 7, 9, 0, 0), u'344': datetime.datetime(2015, 1, 11, 0, 0), u'345': datetime.datetime(2015, 1, 7, 0, 0), u'346': datetime.datetime(2014, 12, 8, 0, 0), u'347': datetime.datetime(2015, 4, 5, 0, 0), u'340': datetime.datetime(2015, 4, 1, 0, 0), u'341': datetime.datetime(2015, 5, 10, 0, 0), u'342': datetime.datetime(2014, 12, 5, 0, 0), u'343': datetime.datetime(2014, 12, 7, 0, 0), u'811': datetime.datetime(2015, 5, 6, 0, 0), u'812': datetime.datetime(2015, 7, 9, 0, 0), u'348': datetime.datetime(2015, 2, 5, 0, 0), u'349': datetime.datetime(2015, 4, 3, 0, 0), u'816': datetime.datetime(2015, 8, 20, 0, 0), u'918': datetime.datetime(2015, 4, 6, 0, 0), u'1149': datetime.datetime(2015, 4, 1, 0, 0), u'719': datetime.datetime(2014, 11, 17, 0, 0), u'718': datetime.datetime(2015, 5, 

## 获取第1周的数据
获取第1周的数据

In [18]:
# 基于学生的加入日期和特定一天的互动记录，若该互动记录发生在学生加入1周内，则返回 True

def within_one_week(join_date, engagement_date):
    """Return True if the engagement happened in the 7 days starting at join_date.

    join_date, engagement_date: datetime objects.
    Returns True when the whole-day difference (engagement minus join) is
    between 0 and 6 inclusive. An engagement dated before the join date
    returns False: its day difference is negative, which is smaller than 7
    but does not lie within the first week after joining.
    """
    time_delta = engagement_date - join_date
    return 0 <= time_delta.days < 7

In [28]:
#####################################
#                 7                 #
#####################################

## Build a list of engagement records limited to paid students and to each
## student's first 7 days after joining, then print the number of matching rows.
print(non_udacity_engagement[0])
print(paid_students.keys())

# paid_engagement_in_first_week is read by the grouping cell further down,
# so it has to be assigned here.
paid_engagement_in_first_week = []
for engagement_record in non_udacity_engagement:
    account_key = engagement_record['account_key']
    # Skip accounts that are not in paid_students.
    if account_key not in paid_students:
        continue
    # paid_students maps the account key to the student's join date.
    if within_one_week(paid_students[account_key],
                       engagement_record['utc_date']):
        paid_engagement_in_first_week.append(engagement_record)

print(len(paid_engagement_in_first_week))

{u'lessons_completed': 0, u'num_courses_visited': 1, u'total_minutes_visited': 11.6793745, u'projects_completed': 0, u'account_key': u'0', u'utc_date': datetime.datetime(2015, 1, 9, 0, 0)}
[u'1200', u'1175', u'1269', u'669', u'1257', u'344', u'345', u'346', u'347', u'340', u'341', u'342', u'343', u'811', u'812', u'348', u'349', u'816', u'918', u'1149', u'719', u'718', u'715', u'426', u'713', u'712', u'710', u'661', u'660', u'594', u'422', u'1060', u'1062', u'619', u'1066', u'427', u'298', u'299', u'296', u'297', u'294', u'295', u'292', u'293', u'290', u'291', u'1128', u'590', u'593', u'592', u'595', u'198', u'597', u'596', u'195', u'1121', u'197', u'1123', u'1124', u'1139', u'193', u'1088', u'270', u'271', u'272', u'273', u'274', u'275', u'276', u'277', u'278', u'279', u'497', u'1067', u'524', u'525', u'526', u'527', u'520', u'1019', u'522', u'523', u'1014', u'1015', u'528', u'1011', u'1013', u'1235', u'1236', u'1231', u'443', u'442', u'441', u'440', u'447', u'446', u'445', u'444', u'4

## Exploring Student Engagement

In [None]:
from collections import defaultdict

# Group the engagement records by student: the dict key is the account key
# and the value is the list of that account's engagement records.

engagement_by_account = defaultdict(list)
for record in paid_engagement_in_first_week:
    engagement_by_account[record['account_key']].append(record)

In [None]:
# 创建一个包含学生在第1周在教室所花总时间的字典。键为帐号（account key），值为数字（所花总时间）
# Add up 'total_minutes_visited' over each account's records.
total_minutes_by_account = {}
for account_key, records in engagement_by_account.items():
    # Start from 0 and accumulate in record order.
    total_minutes_by_account[account_key] = 0
    for record in records:
        total_minutes_by_account[account_key] += record['total_minutes_visited']

In [None]:
import numpy as np

# Summarize and describe the time spent in the classroom
total_minutes = total_minutes_by_account.values()  # one total per account key
# Print the mean, standard deviation, minimum and maximum of those totals.
print 'Mean:', np.mean(total_minutes)
print 'Standard deviation:', np.std(total_minutes)
print 'Minimum:', np.min(total_minutes)
print 'Maximum:', np.max(total_minutes)

## 纠错现有的数据分析代码

In [None]:
#####################################
#                 8                 #
#####################################

## 通过之前的方法检查是否有问题数据存在。
## 定位至少一条异常数据，打印出来并检查。

## 第1周完成的课程数（Lessons）

In [None]:
#####################################
#                 9                 #
#####################################

## 修改之前的代码，找出第1周学生完成课程数的 平均值、标准差、最小值、最大值。尝试创建一个或更多的函数来复用之前的代码


## 第1周的访问次数

In [None]:
######################################
#                 10                 #
######################################

## 找出第1周学生访问教室天数的平均值、标准差、最小值、最大值。

## 区分项目通过的学生

In [None]:
######################################
#                 11                 #
######################################

## Create two lists of first-week engagement data for paid students: the first
## holds the students who passed the project, the second holds those who did not.

subway_project_lesson_keys = ['746169184', '3176718735']

# TODO: exercise placeholders. Both assignments still lack a right-hand side,
# so this cell does not parse until they are filled in.
passing_engagement =
non_passing_engagement =

## 对比两组学生数据

In [None]:
######################################
#                 12                 #
######################################

## 计算你所感兴趣的数据指标，并分析通过项目和没有通过项目的两组学生有何异同。
## 你可以从我们之前使用过的数据指标开始（教室的访问时间、课程完成数、访问天数）。


## 制作直方图

In [None]:
######################################
#                 13                 #
######################################

## 针对通过项目和没有通过项目的两组学生，为我们之前研究的三个数据指标制作直方图。
## 你也可以为其它你所检验的数据指标来制作直方图。

## 改进图表及分享发现

In [None]:
######################################
#                 14                 #
######################################

## 至少改进一幅之前的可视化图表，尝试导入 seaborn 库使你的图表看起来更美观。
## 加入轴标签及表头，并修改一个或多个 hist() 内的变量。