# My Dataquest Learning Report

In [367]:
import pandas as pd
import matplotlib.pyplot as plt
import mailbox
%matplotlib inline

## Import downloaded emails, parse & format

In [368]:
# Import and parse gmail content with mailbox.
# create=False: mailbox.mbox() otherwise creates an empty mailbox file when the
# path does not exist, which would make every later cell silently run on zero
# emails. With create=False a wrong path raises mailbox.NoSuchMailboxError.
mb = mailbox.mbox('dataquest learning curve.mbox', create=False)

In [369]:
# Get email received dates and content
message_timestamp = []
content = []


def _message_text(message):
    """Return the decoded text of every leaf part of `message`, concatenated.

    `walk()` yields the message itself followed by all nested sub-parts in
    order. Multipart containers return None from `get_payload(decode=True)`
    and are skipped, so nested multipart emails no longer break the join.
    Each part is decoded with its declared charset, falling back to UTF-8
    when none is declared or the declared one is unknown to Python.
    """
    pieces = []
    for part in message.walk():
        payload = part.get_payload(decode=True)  # bytes, or None for containers
        if payload is None:
            continue
        charset = part.get_content_charset() or 'utf-8'
        try:
            pieces.append(payload.decode(charset, errors='replace'))
        except LookupError:
            # The declared charset has no codec; fall back to UTF-8.
            pieces.append(payload.decode('utf-8', errors='replace'))
    return ''.join(pieces)


for message in mb.itervalues():
    # Read the Date header directly instead of copying every header into a dict
    message_timestamp.append(message['Date'])

    # Decode the body (all parts, in order) to a single string
    content.append(_message_text(message))

In [370]:
# Peek at the first three raw Date headers to see the format that needs parsing
message_timestamp[:3]

['Mon, 29 Jun 2020 13:10:01 +0000',
 'Mon, 07 Sep 2020 13:10:02 +0000',
 'Mon, 09 Nov 2020 14:10:12 +0000']

In [371]:
import datetime as dt

# Format date to only %d, %m, %Y
# The header ends with a UTC offset such as '+0000'. It is parsed with %z so
# that negative or non-zero offsets (e.g. '-0500') are accepted as well; the
# previous '+%f' directive only matched a literal '+' followed by digits.
# The calendar date is taken as written in the header (no timezone conversion).
f = '%a, %d %b %Y %H:%M:%S %z'

clean_date = [
    dt.datetime.strptime(timestamp, f).strftime('%d, %m, %Y')
    for timestamp in message_timestamp
]

In [372]:
# Show the reformatted dates (day, month, year) for every email
clean_date

['29, 06, 2020',
 '07, 09, 2020',
 '09, 11, 2020',
 '06, 07, 2020',
 '12, 10, 2020',
 '05, 10, 2020',
 '22, 06, 2020',
 '02, 11, 2020',
 '10, 08, 2020',
 '30, 11, 2020',
 '17, 08, 2020',
 '13, 07, 2020',
 '26, 10, 2020',
 '23, 11, 2020',
 '20, 07, 2020',
 '27, 07, 2020']

In [373]:
# Check out the email content format by looking at the first message.
# NOTE(review): the rendered output of this cell includes the recipient's
# email address and an unsubscribe-link token; clear or redact the output
# before sharing the notebook.
content[0]

'\n\nYour Weekly Activity Summary\n\nHello there, Xuewei!\n\nCongratulations on a productive week! Here\'s a review of what you accomplished \nlast week:\n\n\n\nMissions Completed\n\n10\n\n\n\n+233%\n\n\n\nMinutes spent learning\n\n1,231\n\n\n\n+492%\n\n\n\nCurrent learning streak\n\n\n\nYour best streak yet!\n\n10 days\n\n\n\n\n\nMissions to your next certificate\n\n1\n\n\n\n\n\nReady to get a jump start on your progress this week? Dive in now and keep \nlearning!\nKeep Learning \nAnd remember, if you need any help or support, the Dataquest Community is here \nfor you <https://community.dataquest.io/>.\n\n\xa0\n\nHappy learning,\n\nThe Dataquest Team\n\n\n <https://www.linkedin.com/school/dataquest-io/>  \n<https://www.facebook.com/dataquestio/>  <https://twitter.com/dataquestio>  \n<https://www.instagram.com/dataquestio/> \nThis email has been sent to veratsien@gmail.com. Click here to unsubscribe \n<http://links.iterable.com/e/encryptedUnsubscribe?_r=4a16705ff7784260b1b139ad7ac93ebf

In [374]:
import re

# For each email body: drop everything from the HTML part onwards (it starts
# at '<!DOCTYPE'), break the remaining text into lines, and keep only the
# non-empty ones.
clean_content = []

for body in content:
    plain_text = re.split(r'<!DOCTYPE', body)[0]
    kept_lines = [line for line in re.split(r'\n', plain_text) if line != '']
    clean_content.append(kept_lines)

In [375]:
# Preview the cleaned line lists of the first two emails
clean_content[:2]

[['Your Weekly Activity Summary',
  'Hello there, Xuewei!',
  "Congratulations on a productive week! Here's a review of what you accomplished ",
  'last week:',
  'Missions Completed',
  '10',
  '+233%',
  'Minutes spent learning',
  '1,231',
  '+492%',
  'Current learning streak',
  'Your best streak yet!',
  '10 days',
  'Missions to your next certificate',
  '1',
  'Ready to get a jump start on your progress this week? Dive in now and keep ',
  'learning!',
  'Keep Learning ',
  'And remember, if you need any help or support, the Dataquest Community is here ',
  'for you <https://community.dataquest.io/>.',
  '\xa0',
  'Happy learning,',
  'The Dataquest Team',
  ' <https://www.linkedin.com/school/dataquest-io/>  ',
  '<https://www.facebook.com/dataquestio/>  <https://twitter.com/dataquestio>  ',
  '<https://www.instagram.com/dataquestio/> ',
  'This email has been sent to veratsien@gmail.com. Click here to unsubscribe ',
  '<http://links.iterable.com/e/encryptedUnsubscribe?_r=4a167

In [376]:
# Extract useful information from clean content
missions_completed = []
missions_increase_pct = []
minutes_spent = []
minutes_increase_pct = []
learning_streak = []


def _two_lines_after(lines, label):
    """Return the two lines that directly follow the first occurrence of `label`."""
    start = lines.index(label)
    return lines[start + 1], lines[start + 2]


for lines in clean_content:
    # Pull every value out first, so nothing is appended if a label is missing
    mission_count, mission_change = _two_lines_after(lines, 'Missions Completed')
    minute_count, minute_change = _two_lines_after(lines, 'Minutes spent learning')

    # The streak length is the first token of the second line after the label
    # (e.g. '10 days' -> '10')
    streak_days = _two_lines_after(lines, 'Current learning streak')[1].split()[0]

    missions_completed.append(mission_count)
    missions_increase_pct.append(mission_change)
    minutes_spent.append(minute_count)
    minutes_increase_pct.append(minute_change)
    learning_streak.append(streak_days)

In [377]:
# Sanity check: all five extracted lists should have the same length
extracted_lists = (
    missions_completed,
    missions_increase_pct,
    minutes_spent,
    minutes_increase_pct,
    learning_streak,
)
print(*(len(values) for values in extracted_lists))

16 16 16 16 16


In [378]:
# Combine the email dates and the extracted fields into one dataframe,
# one column per list (all values are still strings at this point)
progress_columns = {
    'date': clean_date,
    'missions_completed': missions_completed,
    'missions_increase_pct': missions_increase_pct,
    'minutes_spent': minutes_spent,
    'minutes_increase_pct': minutes_increase_pct,
    'learning_streak(days)': learning_streak,
}
learning_progress = pd.DataFrame(progress_columns)

In [387]:
# Preview the first rows of the dataframe.
# NOTE(review): this cell's execution count (387) is higher than that of the
# conversion and sort cells further down, and its saved output already shows
# converted, date-sorted values. On a fresh Restart & Run All it would instead
# display the raw strings in mailbox order.
learning_progress.head()

Unnamed: 0,date,missions_completed,missions_increase_pct,minutes_spent,minutes_increase_pct,learning_streak(days)
0,2020-06-22,5,100,277,100,3
1,2020-06-29,10,233,1231,492,10
2,2020-07-06,7,-12,646,-43,3
3,2020-07-13,7,17,843,58,10
4,2020-07-20,6,-14,1133,48,6


In [380]:
# Convert strings to proper dtypes

# Dates were written as '%d, %m, %Y' strings; parse them with that exact format
learning_progress['date'] = pd.to_datetime(learning_progress['date'], format='%d, %m, %Y')

# '+233%' -> '+233'. '%' is a literal character, so regex matching is turned off.
for pct_col in ('missions_increase_pct', 'minutes_increase_pct'):
    learning_progress[pct_col] = learning_progress[pct_col].str.replace('%', '', regex=False)

# '1,231' -> '1231'. The pattern must be a raw string and regex=True must be
# explicit: pandas 2.0 changed the default of `regex` to False, in which case
# '\D' would be searched for literally and the thousands separator would stay.
learning_progress['minutes_spent'] = learning_progress['minutes_spent'].str.replace(r'\D', '', regex=True)

# Every column after 'date' now holds digit strings (optionally signed)
int_cols = learning_progress.columns[1:]
learning_progress[int_cols] = learning_progress[int_cols].astype(int)

In [381]:
# Confirm the conversions: 'date' should be datetime64 and every other column an integer
learning_progress.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   date                   16 non-null     datetime64[ns]
 1   missions_completed     16 non-null     int64         
 2   missions_increase_pct  16 non-null     int64         
 3   minutes_spent          16 non-null     int64         
 4   minutes_increase_pct   16 non-null     int64         
 5   learning_streak(days)  16 non-null     int64         
dtypes: datetime64[ns](1), int64(5)
memory usage: 896.0 bytes


In [385]:
# Put the rows in chronological order and renumber the index from 0
learning_progress = learning_progress.sort_values(by='date', ignore_index=True)

In [388]:
# Display the whole table, now ordered by date
learning_progress

Unnamed: 0,date,missions_completed,missions_increase_pct,minutes_spent,minutes_increase_pct,learning_streak(days)
0,2020-06-22,5,100,277,100,3
1,2020-06-29,10,233,1231,492,10
2,2020-07-06,7,-12,646,-43,3
3,2020-07-13,7,17,843,58,10
4,2020-07-20,6,-14,1133,48,6
5,2020-07-27,16,220,1369,45,6
6,2020-08-10,10,-9,1017,34,7
7,2020-08-17,9,29,837,-6,14
8,2020-09-07,5,0,783,25,7
9,2020-10-05,9,29,1058,32,3


## Analysis

In [383]:
# Total number of missions completed, summed over all rows of the dataframe
learning_progress.missions_completed.sum()

142