Importing Libraries

In [None]:
import pandas as pd
from datetime import datetime 
import matplotlib.pyplot as plt
from collections import defaultdict
import numpy as np

Loading the datasets

In [None]:
# Enrollment table: read the CSV, wrap it in a DataFrame and report its shape.
enroll_file = pd.read_csv('enrollments.csv')
enroll_datafile = pd.DataFrame(enroll_file)
print(enroll_datafile.shape)
# Print the enrollment rows whose account_key compares equal to True.
# NOTE(review): for a numeric column this presumably matches account_key == 1;
# confirm that this is the intended sanity check.
account_is_true = enroll_datafile['account_key'].eq(True)
print(enroll_datafile.loc[account_is_true])

# Engagement table.
engage_file = pd.read_csv('daily_engagement.csv')
engage_datafile = pd.DataFrame(engage_file)
print(engage_datafile.shape)

# Project submission table.
project_file = pd.read_csv('project_submissions.csv')
project_datafile = pd.DataFrame(project_file)
print(project_datafile.shape)

In [None]:
# A notebook cell only renders its *last* expression, so the enrollment
# preview has to be displayed explicitly or it is silently discarded.
# `display` is the built-in rich-display helper of the IPython kernel.
display(enroll_datafile.head())
engage_datafile.head()

Finding Unique values

In [None]:
# Distinct account keys per table, stored as sets.
# The previous `.loc[<unique keys>]` treated the key *values* as row index
# labels and therefore returned unrelated rows; it also made `key in unique_*`
# test column labels instead of account keys.  Sets give the right count via
# len() and a correct (and fast) membership test.
unique_enroll = set(enroll_datafile['account_key'].unique())
unique_project = set(project_datafile['account_key'].unique())
unique_engage = set(engage_datafile['account_key'].unique())

print(len(unique_enroll))
print(len(unique_project))
print(len(unique_engage))
# Preview a handful of keys (in order of first appearance) rather than
# dumping the whole collection.
print(engage_datafile['account_key'].unique()[:10])

In [None]:
# Rename the 'acc' column to 'account_key'.
# A bare mapping passed to DataFrame.rename is applied to the row index, so
# the original call never touched any column; `columns=` targets the header.
# Missing source columns are ignored by rename, so this stays a no-op when
# the table has no 'acc' column.
# NOTE(review): confirm that the enrollment table is the one whose key
# column needs renaming.
enroll_datafile = enroll_datafile.rename(columns={'acc': 'account_key'})
enroll_datafile.head()

Missing engagement records

In [None]:
# Count enrollment rows whose account never appears in the engagement table.
# The membership test is done against the engagement account_key column
# itself: `x in some_dataframe` checks column labels, not cell values, which
# made the previous loop count every enrollment as missing.
enrolled_keys = enroll_datafile['account_key']
has_engagement = enrolled_keys.isin(engage_datafile['account_key'])
no = int((~has_engagement).sum())

print(no)

Checking which students stayed in the course

In [None]:
# Enrollments that have engagement records and whose join and cancel dates
# differ.  Rows with a missing cancel_date also qualify, because a missing
# value never compares equal to the join date.
# Membership is tested against the engagement account_key values directly;
# `key in some_dataframe` would test column labels and match nothing.
has_engagement = enroll_datafile['account_key'].isin(engage_datafile['account_key'])
dates_differ = enroll_datafile['join_date'] != enroll_datafile['cancel_date']
stayed_enrollments = enroll_datafile[has_engagement & dates_differ]

# Report the count and a short preview instead of printing every row.
print(len(stayed_enrollments))
stayed_enrollments.head()

Selecting the students who stayed more than 7 days in the course

In [None]:
# First pass at the paid-student lookup: an enrollment qualifies when it
# lasted more than 7 days before cancellation or was not canceled at all.
# `cnt` counts qualifying enrollment rows; `paid_student` maps each
# qualifying account key to itself, so its length is the number of
# distinct accounts.
paid_student = {}
cnt = 0
enrollment_columns = zip(
    enroll_datafile['account_key'],
    enroll_datafile['days_to_cancel'],
    enroll_datafile['is_canceled'],
)
for account, days_to_cancel, is_canceled in enrollment_columns:
    if days_to_cancel > 7 or not is_canceled:
        cnt += 1
        paid_student[account] = account

print(cnt)
len(paid_student)

In [None]:
# Map every paid account to its join date.  An enrollment counts as paid when
# it was never canceled (days_to_cancel is missing) or lasted more than 7 days.
# An account can have several qualifying enrollments; keep the most recent
# join date instead of whichever row happens to come last in the file.
paid_student = {}
for _, enrollment in enroll_datafile.iterrows():
    days_to_cancel = enrollment['days_to_cancel']
    if not (pd.isnull(days_to_cancel) or days_to_cancel > 7):
        continue
    account = enrollment['account_key']
    join_date = enrollment['join_date']
    # Dates are 'YYYY-MM-DD' strings (that is the format within_week parses),
    # so plain string comparison orders them chronologically.
    if account not in paid_student or join_date > paid_student[account]:
        paid_student[account] = join_date
len(paid_student)
        

Getting Data from First Week

In [None]:
def within_week(join_date,engage_date):
    """Tell whether an engagement falls in the first week after joining.

    Both arguments are date strings in 'YYYY-MM-DD' format.  Returns True
    when the engagement date is on the join date or up to six days after
    it, and False otherwise (including engagement before the join date).
    """
    date_format = '%Y-%m-%d'
    joined = datetime.strptime(join_date, date_format)
    engaged = datetime.strptime(engage_date, date_format)
    elapsed_days = (engaged - joined).days
    return 0 <= elapsed_days < 7
# Engagement rows of paid students that fall inside their first week.
paid_engagement = []

def first_week():
    """Fill `paid_engagement` with first-week engagement rows of paid students.

    Walks `engage_datafile` row by row; a row is kept when its account is a
    key of `paid_student` and its 'utc_date' lies within a week of that
    account's stored join date (see `within_week`).  The list is emptied
    first so that calling the function again does not append duplicates.
    Returns None; the result is the module-level `paid_engagement` list.
    """
    paid_engagement.clear()
    for _, record in engage_datafile.iterrows():
        account = record['account_key']
        if account not in paid_student:
            continue
        if within_week(paid_student[account], record['utc_date']):
            paid_engagement.append(record)

first_week()
len(paid_engagement)

Minutes Spent and Lessons Completed in First Week

In [None]:
def data_print(data):
    """Print the mean, standard deviation, maximum and minimum of a dict's values.

    `data` is a mapping whose values are numeric; the keys are ignored.
    """
    values = list(data.values())
    summary = 'Mean : {}, Std : {}, Max : {}, Min : {}'.format(
        np.average(values),
        np.std(values),
        np.max(values),
        np.min(values),
    )
    print(summary)


# Group the first-week engagement rows by account key.
engage_account = defaultdict(list)
for record in paid_engagement:
    engage_account[record['account_key']].append(record)

# Total minutes visited per account over its first-week rows.
total_min = {
    account: sum(record['total_minutes_visited'] for record in records)
    for account, records in engage_account.items()
}
data_print(total_min)

In [None]:
# For each account, the largest 'lessons_completed' value found among its
# first-week rows, never lower than 0 (the running maximum starts at 0).
# NOTE(review): this is a per-row maximum, not a weekly total - confirm that
# this is the intended statistic.
lesson_completed = {
    account: max([0] + [record['lessons_completed'] for record in records])
    for account, records in engage_account.items()
}
lesson_completed

Number of Visits in the First Week

In [None]:
# Count, per account, the first-week rows with at least one course visited.
# Each row is also tagged with a 0/1 'has_visited' flag, as before.
has_visited = {}
for account, records in engage_account.items():
    visit_count = 0
    for record in records:
        visited = 1 if record['num_courses_visited'] > 0 else 0
        record['has_visited'] = visited
        visit_count += visited
    # Store the total once per account, after all of its rows are processed
    # (it used to be re-assigned on every row of the inner loop).
    has_visited[account] = visit_count
print(len(has_visited))

data_print(has_visited)

Splitting out Passing Students

In [None]:
# Lesson keys of the project of interest and the ratings that count as a pass.
subway_key = [746169184,3176718735]
submission_value = ['PASSED','DISTINCTION']

# A submission is a pass when it is for one of the lessons above AND carries
# a passing rating.
is_passing_submission = (
    project_datafile['lesson_key'].isin(subway_key)
    & project_datafile['assigned_rating'].isin(submission_value)
)
passed_project = set(project_datafile.loc[is_passing_submission, 'account_key'])
# Accounts with at least one non-passing submission, minus everyone who did
# pass.  Without the subtraction a student who failed once (or submitted a
# different project) and later passed ends up in BOTH groups.
# Accounts that never submitted anything are in neither set.
non_passed_project = (
    set(project_datafile.loc[~is_passing_submission, 'account_key'])
    - passed_project
)
print(len(passed_project))
print(len(non_passed_project))
print(len(project_datafile))

Comparing the Two Student Groups

In [None]:
# Split the first-week engagement rows by whether their account passed the
# project.  Each list holds one account key per engagement row, so the
# lengths count rows rather than distinct students.
passed_engagement = [
    record['account_key']
    for record in paid_engagement
    if record['account_key'] in passed_project
]
non_passed_engagement = [
    record['account_key']
    for record in paid_engagement
    if record['account_key'] not in passed_project
]

print(len(passed_engagement))
print(len(non_passed_engagement))

print(len(paid_engagement))


In [None]:
def all_calculte(data,point):
    """Average one engagement field per account for a group of accounts.

    data  -- container of account keys (anything supporting `in`); only
             accounts present in it are included.
    point -- name of the field to read from each engagement row.

    Reads the module-level `engage_account` mapping (account key -> list of
    rows) and returns a dict mapping each selected account key to the
    average of `row[point]` over that account's rows.
    """
    return {
        account: np.average([record[point] for record in records])
        for account, records in engage_account.items()
        if account in data
    }

# Per-account averages of each engagement metric for the passing group.
passed_minute = all_calculte(passed_project,'total_minutes_visited')
passed_lesson = all_calculte(passed_project,'lessons_completed')
passed_course_visited = all_calculte(passed_project,'num_courses_visited')

# Same three metrics for the non-passing group.
non_passed_minute = all_calculte(non_passed_project,'total_minutes_visited')
non_passed_lesson = all_calculte(non_passed_project,'lessons_completed')
# Must read 'num_courses_visited' like its passing counterpart; it previously
# re-used 'lessons_completed' (copy-paste slip), duplicating non_passed_lesson.
non_passed_course_visited = all_calculte(non_passed_project,'num_courses_visited')

Making Histograms

In [None]:
# One histogram per metric for the passing group.  Each dict maps an account
# to the average of that metric over the account's first-week engagement
# rows, so the x axis is that average and the y axis counts students.
passed_plots = [
    ("Average minutes for passed students",
     "Average minutes visited per engagement record",
     passed_minute),
    ("Number of lesson completed by passed students",
     "Average lessons completed per engagement record",
     passed_lesson),
    ("Number of Courses Visited for passed students",
     "Average courses visited per engagement record",
     passed_course_visited),
]
for title, xlabel, per_student in passed_plots:
    plt.hist(list(per_student.values()))
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Number of students")
    plt.show()

In [None]:
# One histogram per metric for the non-passing group.  Each dict maps an
# account to the average of that metric over the account's first-week
# engagement rows, so the x axis is that average and the y axis counts
# students.
non_passed_plots = [
    ("Average minutes for non-passed students",
     "Average minutes visited per engagement record",
     non_passed_minute),
    ("Number of lesson completed by non-passed students",
     "Average lessons completed per engagement record",
     non_passed_lesson),
    ("Number of Courses Visited for non-passed students",
     "Average courses visited per engagement record",
     non_passed_course_visited),
]
for title, xlabel, per_student in non_passed_plots:
    plt.hist(list(per_student.values()))
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Number of students")
    plt.show()

In [None]:
# Largest per-account average of minutes visited among passing students.
passed_minute_values = list(passed_minute.values())
print(np.max(passed_minute_values))