In [4]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

## Read csv files using pandas

In [2]:
# Load the daily-engagement table from a CSV file (path relative to the notebook) into a DataFrame.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which suggests
# the CSV was written together with its index; consider index_col=0 — confirm.
daily_engagement = pd.read_csv("../data/daily_engagement_full.csv")

In [6]:
# Confirm that read_csv produced a pandas DataFrame.
type(daily_engagement)

pandas.core.frame.DataFrame

In [7]:
# Display the frame (pandas elides the middle rows of a large frame).
# NOTE(review): daily_engagement.head() would give a smaller, bounded preview.
daily_engagement

Unnamed: 0,utc_date,acct,registration_date,subscription_start,course_key,sibling_key,course_title,has_visited,total_minutes_visited,lessons_completed,projects_completed,account_key
0,2014-11-05,448,2014-08-05,2014-11-05,ud359-nd,ud359,Intro to Data Science,0.0,0.0,0.0,0.0,2257038596
1,2014-11-05,448,2014-08-05,2014-11-05,ud120-nd,ud120,Intro to Machine Learning,0.0,0.0,0.0,0.0,2257038596
2,2014-11-05,448,2014-08-05,2014-11-05,ud651-nd,ud651,Data Analysis with R,0.0,0.0,0.0,0.0,2257038596
3,2014-11-05,448,2014-08-05,2014-11-05,ud507-nd,ud507,Data Visualization and D3.js,0.0,0.0,0.0,0.0,2257038596
4,2014-11-05,448,2014-08-05,2014-11-05,ud651,ud651,Data Analysis with R,0.0,0.0,0.0,0.0,2257038596
...,...,...,...,...,...,...,...,...,...,...,...,...
2309234,2015-08-26,854,2012-10-12,2015-08-23,ud359,ud359,Intro to Data Science,0.0,0.0,0.0,0.0,u26334020
2309235,2015-08-26,854,2012-10-12,2015-08-23,ud804,ud804,JavaScript Basics,0.0,0.0,0.0,0.0,u26334020
2309236,2015-08-26,854,2012-10-12,2015-08-23,ud651,ud651,Data Analysis with R,0.0,0.0,0.0,0.0,u26334020
2309237,2015-08-26,854,2012-10-12,2015-08-23,ud304,ud304,Intro to HTML and CSS,0.0,0.0,0.0,0.0,u26334020


## pandas Series.unique()

In [8]:
# Count the distinct values in the 'acct' column: selecting one column yields a
# Series, and Series.unique() returns its distinct values.
len(daily_engagement['acct'].unique())


1237

## numpy array vectorized operations

In [9]:
# First 20 countries with employment data.
# `countries` and `employment` are parallel arrays: the value at position i in
# `employment` belongs to the country at position i in `countries`.
countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina'
])

# Employment data in 2007 for those 20 countries
employment = np.array([
    55.70000076,  51.40000153,  50.5       ,  75.69999695,
    58.40000153,  40.09999847,  61.5       ,  57.09999847,
    60.90000153,  66.59999847,  60.40000153,  68.09999847,
    66.90000153,  53.40000153,  48.59999847,  56.79999924,
    71.59999847,  58.40000153,  70.40000153,  41.20000076
])

In [11]:
# Accessing elements

# Indexing is zero-based: index 0 is the first element, index 3 the fourth.
print(countries[0])
print(countries[3])

Afghanistan
Angola


In [12]:
# Slicing

print(countries[0:3])   # indices 0, 1, 2 — the stop index is exclusive
print(countries[:3])    # an omitted start means "from the beginning"; same result
print(countries[17:])   # an omitted stop means "through the last element"
print(countries[:])     # both omitted: every element

['Afghanistan' 'Albania' 'Algeria']
['Afghanistan' 'Albania' 'Algeria']
['Bhutan' 'Bolivia' 'Bosnia and Herzegovina']
['Afghanistan' 'Albania' 'Algeria' 'Angola' 'Argentina' 'Armenia'
 'Australia' 'Austria' 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh'
 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin' 'Bhutan' 'Bolivia'
 'Bosnia and Herzegovina']


In [13]:
# Element types

# Every element of a NumPy array shares one dtype, inferred from the values given.
print(countries.dtype)    # unicode string dtype; the width (22) is the length of the longest name
print(employment.dtype)
print(np.array([0, 1, 2, 3]).dtype)   # recorded output is int32; the default integer width can differ by platform
print(np.array([1.0, 1.5, 2.0, 2.5]).dtype)
print(np.array([True, False, True]).dtype)
print(np.array(['AL', 'AK', 'AZ', 'AR', 'CA']).dtype)

<U22
float64
int32
float64
bool
<U2


In [14]:
# Looping

# Walk the array directly, one element at a time.
for country in countries:
    print('Examining country {}'.format(country))

# Walk the array with a position counter so the matching entry of the
# parallel `employment` array can be fetched with the same index.
for i, country in enumerate(countries):
    country_employment = employment[i]
    print('Country {} has employment {}'.format(country,country_employment))

Examining country Afghanistan
Examining country Albania
Examining country Algeria
Examining country Angola
Examining country Argentina
Examining country Armenia
Examining country Australia
Examining country Austria
Examining country Azerbaijan
Examining country Bahamas
Examining country Bahrain
Examining country Bangladesh
Examining country Barbados
Examining country Belarus
Examining country Belgium
Examining country Belize
Examining country Benin
Examining country Bhutan
Examining country Bolivia
Examining country Bosnia and Herzegovina
Country Afghanistan has employment 55.70000076
Country Albania has employment 51.40000153
Country Algeria has employment 50.5
Country Angola has employment 75.69999695
Country Argentina has employment 58.40000153
Country Armenia has employment 40.09999847
Country Australia has employment 61.5
Country Austria has employment 57.09999847
Country Azerbaijan has employment 60.90000153
Country Bahamas has employment 66.59999847
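Country Bahrain has employment 60.40000153
Country Bangladesh has employment 68.09999847
Country Barbados has employment 66.90000153
Country Belarus has employment 53.40000153
Country Belgium has employment 48.59999847
Country Belize has employment 56.79999924
Country Benin has employment 71.59999847
Country Bhutan has employment 58.40000153
Country Bolivia has employment 70.40000153
Country Bosnia and Herzegovina has employment 41.20000076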

In [15]:
# Numpy functions

# Each call reduces the whole array to a single number.
print(employment.mean())
print(employment.std())   # NumPy's default is the population standard deviation (ddof=0)
print(employment.max())
print(employment.sum())

58.68500003850001
9.338269113687888
75.69999695
1173.70000077


In [17]:
def max_employment(countries, employment):
    '''
    Return (country_name, employment_value) for the country with the
    highest employment.

    countries  -- NumPy array of country names.
    employment -- NumPy array of employment values in the same order
                  as `countries`.

    When several entries share the maximum, the first one is returned
    (that is what argmax reports).
    '''
    # Position of the largest employment value; the same position selects
    # the matching name from the parallel `countries` array.
    top_index = employment.argmax()
    top_country = countries[top_index]
    top_value = employment[top_index]
    return top_country, top_value

# Apply the function to the employment data set defined earlier in the notebook.
print(max_employment(countries, employment))

('Angola', 75.69999695)


In [18]:
# Arithmetic operations between 2 NumPy arrays

# Each operator works element by element on arrays of equal length.
a = np.array([1, 2, 3, 4])
b = np.array([1, 2, 1, 2])
    
print(a + b)
print(a - b)
print(a * b)
print(a / b)    # true division: the result is a float array even for integer inputs
print(a ** b)   # each element of a raised to the matching element of b
    

[2 4 4 6]
[0 0 2 2]
[1 4 3 8]
[1. 1. 3. 2.]
[ 1  4  3 16]


In [19]:
# Arithmetic operations between a NumPy array and a single number

a = np.array([1, 2, 3, 4])
b = 2
    
print(a + b)    # the single number is combined with every element of the array
print(a ** b)   # every element squared

[3 4 5 6]
[ 1  4  9 16]


In [20]:
# Logical operations with NumPy arrays

a = np.array([True, True, False, False])
b = np.array([True, False, True, False])
    
print(a & b)   # element-wise AND
print(a | b)   # element-wise OR
print(~a)      # element-wise NOT

[ True False False False]
[ True  True  True False]
[False False  True  True]


In [21]:
# Comparison operations between 2 NumPy Arrays

# Each comparison is made element by element and yields a boolean array.
a = np.array([1, 2, 3, 4, 5])
b = np.array([5, 4, 3, 2, 1])
    
print(a > b)
print(a == b)
print(a != b)

[False False False  True  True]
[False False  True False False]
[ True  True False  True  True]


In [22]:
# Comparison operations between a NumPy array and a single number

# Every element is compared with the single number, yielding a boolean array.
a = np.array([1, 2, 3, 4])
b = 2
    
print(a > b)
print(a == b)
print(a != b)

[False False  True  True]
[False  True False False]
[ True False  True  True]


In [25]:
# First 20 countries with school completion data.
# NOTE: this rebinds `countries`, replacing the employment country list defined earlier.
countries = np.array([
       'Algeria', 'Argentina', 'Armenia', 'Aruba', 'Austria','Azerbaijan',
       'Bahamas', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Bolivia',
       'Botswana', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi',
       'Cambodia', 'Cameroon', 'Cape Verde'
])

# Female school completion rate in 2007 for those 20 countries.
# Some values exceed 100 (e.g. 119.28105) — presumably a gross ratio rather than
# a strict percentage; confirm against the data source.
female_completion = np.array([
    97.35583,  104.62379,  103.02998,   95.14321,  103.69019,
    98.49185,  100.88828,   95.43974,   92.11484,   91.54804,
    95.98029,   98.22902,   96.12179,  119.28105,   97.84627,
    29.07386,   38.41644,   90.70509,   51.7478 ,   95.45072
])

# Male school completion rate in 2007 for those 20 countries (same order as above)
male_completion = np.array([
     95.47622,  100.66476,   99.7926 ,   91.48936,  103.22096,
     97.80458,  103.81398,   88.11736,   93.55611,   87.76347,
    102.45714,   98.73953,   92.22388,  115.3892 ,   98.70502,
     37.00692,   45.39401,   91.22084,   62.42028,   90.66958
])

def overall_completion_rate(female_completion, male_completion):
    '''
    Return the per-country overall school completion rate as a NumPy array.

    female_completion -- NumPy array of female completion rates.
    male_completion   -- NumPy array of male completion rates, with the
                         countries in the same order.

    The overall rate is the unweighted average of the two rates for each
    country, computed element-wise.
    '''
    combined = female_completion + male_completion
    return combined / 2.0

# Evaluate on the 2007 data; the resulting array is displayed as the cell output.
overall_completion_rate(female_completion, male_completion)

array([ 96.416025, 102.644275, 101.41129 ,  93.316285, 103.455575,
        98.148215, 102.35113 ,  91.77855 ,  92.835475,  89.655755,
        99.218715,  98.484275,  94.172835, 117.335125,  98.275645,
        33.04039 ,  41.905225,  90.962965,  57.08404 ,  93.06015 ])

In [26]:
# First 20 countries with employment data.
# Re-created here because `countries` was rebound to the school-completion
# country list in the preceding cell.
countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina'
])

# Employment data in 2007 for those 20 countries
employment = np.array([
    55.70000076,  51.40000153,  50.5       ,  75.69999695,
    58.40000153,  40.09999847,  61.5       ,  57.09999847,
    60.90000153,  66.59999847,  60.40000153,  68.09999847,
    66.90000153,  53.40000153,  48.59999847,  56.79999924,
    71.59999847,  58.40000153,  70.40000153,  41.20000076
])

# NOTE(review): country_name is not referenced anywhere in the visible notebook,
# and 'United States' is not among the 20 countries above — confirm whether it is still needed.
country_name = 'United States'

from scipy import stats

def standardize_data(values):
    '''
    Return the standardized (z-score) version of `values`.

    Each value is converted to the number of standard deviations it lies
    from the mean of the data: positive for values above the mean,
    negative for values below it.

    values -- NumPy array (or other array-like) of numbers.

    Returns a NumPy array of the same shape. The computation uses only
    NumPy, so it does not depend on an import made part-way through the
    notebook. It keeps the defaults of scipy.stats.zscore: statistics are
    taken along axis 0 with the population standard deviation (ddof=0),
    so a 1-D array is standardized as a whole and a 2-D array column by
    column.

    If the data has zero spread the standard deviation is 0 and the
    result is NaN (NumPy emits a RuntimeWarning for the division).
    '''
    values = np.asarray(values)
    center = values.mean(axis=0)
    spread = values.std(axis=0)   # ddof=0: population standard deviation
    # Vectorized: the subtraction and the division apply to every element.
    return (values - center) / spread

# Standardize the 2007 employment values; the resulting array is the cell output.
standardize_data(employment)

array([-0.31965231, -0.780123  , -0.87650077,  1.82207181, -0.03051941,
       -1.99019768,  0.30144772, -0.16973184,  0.23719615,  0.84758731,
        0.18365304,  1.00821665,  0.87971351, -0.56595055, -1.07996476,
       -0.20185762,  1.38301845, -0.03051941,  1.2545153 , -1.87240259])

## numpy index arrays

In [27]:
# Using index arrays

a = np.array([1, 2, 3, 4])
b = np.array([True, True, False, False])
    
print(a[b])   # a boolean array used as an index keeps the elements where it is True
print(a[np.array([True, False, True, False])])

[1 2]
[1 3]


In [29]:
# Creating the index array using vectorized operations

a = np.array([1, 2, 3, 2, 1])
b = (a >= 2)   # boolean array: True where the element is at least 2
    
print(a[b])
print(a[a >= 2])   # same selection with the comparison written inline

[2 3 2]
[2 3 2]


In [31]:
# Creating the index array using vectorized operations on another array

a = np.array([1, 2, 3, 4, 5])
b = np.array([1, 2, 3, 2, 1])
    
print(a[b == 2])   # elements of a at the positions where b equals 2

[2 4]


In [32]:
def mean_time_for_paid_students(time_spent, days_to_cancel, min_days=7):
    '''
    Return the mean time spent in the classroom by students who stayed
    enrolled for at least `min_days` days (days_to_cancel >= min_days).

    time_spent     -- NumPy array with the time each student spent in the
                      classroom.
    days_to_cancel -- NumPy array with the number of days until each
                      student cancelled, in the same order as time_spent.
                      It is assumed to contain only numbers, i.e. every
                      student has cancelled.
    min_days       -- minimum number of enrolled days for a student to be
                      included; defaults to 7.

    If no student meets the threshold the selection is empty, and NumPy's
    mean of an empty array (NaN, with a RuntimeWarning) is returned.
    '''
    # Boolean index array: True for each student enrolled long enough.
    stayed_enrolled = days_to_cancel >= min_days
    return time_spent[stayed_enrolled].mean()

# Time spent in the classroom in the first week for 20 students
time_spent = np.array([
       12.89697233,    0.        ,   64.55043217,    0.        ,
       24.2315615 ,   39.991625  ,    0.        ,    0.        ,
      147.20683783,    0.        ,    0.        ,    0.        ,
       45.18261617,  157.60454283,  133.2434615 ,   52.85000767,
        0.        ,   54.9204785 ,   26.78142417,    0.
])

# Days to cancel for 20 students (same student order as time_spent)
# NOTE(review): no call to mean_time_for_paid_students with these arrays is
# visible in this cell — confirm whether one was intended.
days_to_cancel = np.array([
      4,   5,  37,   3,  12,   4,  35,  38,   5,  37,   3,   3,  68,
     38,  98,   2, 249,   2, 127,  35
])

## numpy arrays - not in place vectorized operations

In [35]:
# `a = a + ...` builds a new array and rebinds the name `a`; `b` keeps
# referring to the original array, so displaying `b` shows it unchanged.
a = np.array([1, 2, 3, 4])
b = a
a = a + np.array([1, 1, 1, 1])  # a points to the new array created by addition, b points to original a
b

array([1, 2, 3, 4])

## numpy arrays - in place vectorized operations

In [5]:
# `a += ...` updates the existing array instead of creating a new one; `a` and
# `b` are two names for that same array, so the change is visible through `b`.
a = np.array([1, 2, 3, 4])
b = a
a += np.array([1, 1, 1, 1])  # a is modified in place, so b points to the updated a
b

array([2, 3, 4, 5])

## numpy arrays - slices are views of the array they refer to

In [6]:
a = np.array([1, 2, 3, 4])

# A slice of a NumPy array is a view onto the same data, not a copy.
# The view is deliberately not named `slice`: that name would shadow Python's
# built-in `slice` type for every later cell run in the same kernel.
first_two = a[:2]  # just a view of the array
first_two[0] = 100  # writing through the view changes `a` itself
a

array([100,   2,   3,   4])