In [1]:
import numpy as np
import pandas as pd
import seaborn as sbs
from matplotlib import pyplot as plt

## NumPy ndarrays

In [2]:
# Subway ridership for 5 stations on 10 different days
# Layout: 10 rows x 5 columns -- each row is one day, each column is one station.
ridership = np.array([
    [   0,    0,    2,    5,    0],
    [1478, 3877, 3674, 2328, 2539],
    [1613, 4088, 3991, 6461, 2691],
    [1560, 3392, 3826, 4787, 2613],
    [1608, 4802, 3932, 4477, 2705],
    [1576, 3933, 3909, 4979, 2685],
    [  95,  229,  255,  496,  201],
    [   2,    0,    1,   27,    0],
    [1438, 3785, 3589, 4174, 2215],
    [1342, 4043, 4009, 4665, 3033]
])

In [4]:
# Accessing elements

# A single element: row 1, column 3 (indices are zero-based)
print(ridership[1, 3])
print()
# A 2x2 sub-array: rows 1-2 and columns 3-4 (the end of a slice is exclusive)
print(ridership[1:3, 3:5])
print()
# One whole row: every column of row 1
print(ridership[1, :])
    

2328

[[2328 2539]
 [6461 2691]]

[1478 3877 3674 2328 2539]


In [6]:
# Vectorized operations on rows or columns

# Element-wise sum of rows 0 and 1: one total per column (5 values)
print(ridership[0, :] + ridership[1, :])
print()
# Element-wise sum of columns 0 and 1: one total per row (10 values)
print(ridership[:, 0] + ridership[:, 1])


[1478 3877 3676 2333 2539]

[   0 5355 5701 4952 6410 5509  324    2 5223 5385]


In [7]:
# Vectorized operations on entire arrays

a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
b = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])

# Two arrays of the same shape are added element by element
print(a + b)


[[ 2  3  4]
 [ 6  7  8]
 [10 11 12]]


In [8]:
def mean_riders_for_max_station(ridership):
    '''
    Compare the busiest first-day station with ridership overall.

    The station (column) with the most riders on the first day (row 0) is
    located with argmax(); if several stations tie, argmax() picks the
    first one. See
    https://numpy.org/doc/stable/reference/generated/numpy.argmax.html

    Parameters:
        ridership -- 2D NumPy array with one row per day and one column
                     per station.

    Returns:
        A tuple (overall_mean, mean_for_max): the mean of every entry in
        the array, and the mean riders per day for the station that was
        busiest on the first day.
    '''
    first_day = ridership[0, :]
    busiest_column = first_day.argmax()
    busiest_station_mean = ridership[:, busiest_column].mean()

    return (ridership.mean(), busiest_station_mean)

mean_riders_for_max_station(ridership)

(2342.6, 3239.9)

## NumPy axis

In [9]:
# NumPy axis argument

a = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
    ])

# In a 2D array axis 0 runs down the rows and axis 1 runs across the columns.
# Summing along an axis collapses it: sum(axis=0) gives one total per column,
# sum(axis=1) gives one total per row, and sum() with no axis adds everything.

print(a.sum())
print()
print(a.sum(axis=0))
print()
print(a.sum(axis=1))


45

[12 15 18]

[ 6 15 24]


In [10]:
def min_and_max_riders_per_day(ridership):
    '''
    Find the highest and lowest mean daily ridership of any station.

    The mean is taken down each column (axis=0), giving one mean riders
    per day value per station; the largest and smallest of those means
    are returned.

    Parameters:
        ridership -- 2D array with one row per day and one column per
                     station.

    Returns:
        A tuple (max_daily_ridership, min_daily_ridership).
    '''
    station_means = ridership.mean(axis=0)

    return (station_means.max(), station_means.min())

min_and_max_riders_per_day(ridership)

(3239.9, 1071.2)

## NumPy and Pandas Data Types

In [14]:
# Subway ridership for 5 stations on 10 different days
# Each row is labelled with a date string (the index) and each column
# with a station identifier, so values can be looked up by label.
ridership_df = pd.DataFrame(
    data=[[   0,    0,    2,    5,    0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691],
          [1560, 3392, 3826, 4787, 2613],
          [1608, 4802, 3932, 4477, 2705],
          [1576, 3933, 3909, 4979, 2685],
          [  95,  229,  255,  496,  201],
          [   2,    0,    1,   27,    0],
          [1438, 3785, 3589, 4174, 2215],
          [1342, 4043, 4009, 4665, 3033]],
    index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
           '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007']
)

# A bare expression on the last line of a cell is displayed as the cell's output
ridership_df


Unnamed: 0,R003,R004,R005,R006,R007
05-01-11,0,0,2,5,0
05-02-11,1478,3877,3674,2328,2539
05-03-11,1613,4088,3991,6461,2691
05-04-11,1560,3392,3826,4787,2613
05-05-11,1608,4802,3932,4477,2705
05-06-11,1576,3933,3909,4979,2685
05-07-11,95,229,255,496,201
05-08-11,2,0,1,27,0
05-09-11,1438,3785,3589,4174,2215
05-10-11,1342,4043,4009,4665,3033


In [15]:
# DataFrame creation

# You can create a DataFrame out of a dictionary mapping column names to values
# Each key becomes a column and its list supplies that column's values, top to
# bottom; no index is given, so rows are numbered 0, 1, 2.
df_1 = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})

print(df_1)

   A  B
0  0  3
1  1  4
2  2  5


In [16]:
# You can also use a list of lists or a 2D NumPy array
# Here each inner list becomes one row, and `columns` names the columns.
df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])

print(df_2)
   

   A  B  C
0  0  1  2
1  3  4  5


In [17]:
# Accessing elements

# .iloc selects by integer position: the first row, returned as a Series
print(ridership_df.iloc[0])
print()
# .loc selects by index label: the row labelled '05-05-11'
print(ridership_df.loc['05-05-11'])
print()
# Square brackets with a column name select that column as a Series
print(ridership_df['R003'])
print()
# .iloc with (row, column) positions returns a single value
print(ridership_df.iloc[1, 3])
print()


R003    0
R004    0
R005    2
R006    5
R007    0
Name: 05-01-11, dtype: int64

R003    1608
R004    4802
R005    3932
R006    4477
R007    2705
Name: 05-05-11, dtype: int64

05-01-11       0
05-02-11    1478
05-03-11    1613
05-04-11    1560
05-05-11    1608
05-06-11    1576
05-07-11      95
05-08-11       2
05-09-11    1438
05-10-11    1342
Name: R003, dtype: int64

2328



In [18]:
# Accessing multiple rows

# Positional slice: rows at positions 1, 2 and 3 (the end of the slice is exclusive)
print(ridership_df.iloc[1: 4])
    

          R003  R004  R005  R006  R007
05-02-11  1478  3877  3674  2328  2539
05-03-11  1613  4088  3991  6461  2691
05-04-11  1560  3392  3826  4787  2613


In [22]:
# Accessing multiple columns

# Passing a list of column names returns a DataFrame with just those columns
print(ridership_df[['R003', 'R005']])
    

          R003  R005
05-01-11     0     2
05-02-11  1478  3674
05-03-11  1613  3991
05-04-11  1560  3826
05-05-11  1608  3932
05-06-11  1576  3909
05-07-11    95   255
05-08-11     2     1
05-09-11  1438  3589
05-10-11  1342  4009


In [23]:
# Pandas axis

df = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})

# With no axis argument, sum() adds down each column: one total per column
print(df.sum())
print()
# axis=1 adds across each row instead: one total per row
print(df.sum(axis=1))
print()
# To total every value, drop to the underlying array and sum that
print(df.values.sum()) # here df.values returns a 2D numpy array
    

A     3
B    12
dtype: int64

0    3
1    5
2    7
dtype: int64

15


In [24]:
def mean_riders_for_max_station(ridership):
    '''
    Compare the busiest first-day station with ridership overall.

    This is the Pandas counterpart of the NumPy exercise earlier in the
    notebook. It deliberately reuses the same function name, so running
    this cell replaces the earlier definition.

    Parameters:
        ridership -- Pandas DataFrame with one row per day and one column
                     per station.

    Returns:
        A tuple (overall_mean, mean_for_max): the mean of every value in
        the frame, and the mean riders per day for the station that had
        the most riders on the first day.
    '''
    # idxmax() returns the column label (not the position) of the largest value
    first_day = ridership.iloc[0]
    busiest_station = first_day.idxmax()
    busiest_station_mean = ridership[busiest_station].mean()

    # DataFrame.mean() would give one mean per column; go through the
    # underlying array to get a single mean over every value.
    return (ridership.values.mean(), busiest_station_mean)

mean_riders_for_max_station(ridership_df)

(2342.6, 3239.9)

## Loading data into a DataFrame

In [25]:
# Relative path: it is resolved against the directory the notebook runs from
filename = "../data/nyc_subway_weather.csv"
# read_csv parses the file into a DataFrame
subway_df = pd.read_csv(filename)

subway_df.head()  # get the first 5 rows

Unnamed: 0,UNIT,DATEn,TIMEn,ENTRIESn,EXITSn,ENTRIESn_hourly,EXITSn_hourly,datetime,hour,day_week,...,pressurei,rain,tempi,wspdi,meanprecipi,meanpressurei,meantempi,meanwspdi,weather_lat,weather_lon
0,R003,05-01-11,00:00:00,4388333,2911002,0.0,0.0,2011-05-01 00:00:00,0,6,...,30.22,0,55.9,3.5,0.0,30.258,55.98,7.86,40.700348,-73.887177
1,R003,05-01-11,04:00:00,4388333,2911002,0.0,0.0,2011-05-01 04:00:00,4,6,...,30.25,0,52.0,3.5,0.0,30.258,55.98,7.86,40.700348,-73.887177
2,R003,05-01-11,12:00:00,4388333,2911002,0.0,0.0,2011-05-01 12:00:00,12,6,...,30.28,0,62.1,6.9,0.0,30.258,55.98,7.86,40.700348,-73.887177
3,R003,05-01-11,16:00:00,4388333,2911002,0.0,0.0,2011-05-01 16:00:00,16,6,...,30.26,0,57.9,15.0,0.0,30.258,55.98,7.86,40.700348,-73.887177
4,R003,05-01-11,20:00:00,4388333,2911002,0.0,0.0,2011-05-01 20:00:00,20,6,...,30.28,0,52.0,10.4,0.0,30.258,55.98,7.86,40.700348,-73.887177


In [26]:
# get the stats per column
# (count, mean, std, min, quartiles and max for each numeric column)

subway_df.describe()

Unnamed: 0,ENTRIESn,EXITSn,ENTRIESn_hourly,EXITSn_hourly,hour,day_week,weekday,latitude,longitude,fog,...,pressurei,rain,tempi,wspdi,meanprecipi,meanpressurei,meantempi,meanwspdi,weather_lat,weather_lon
count,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,...,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0,42649.0
mean,28124860.0,19869930.0,1886.589955,1361.487866,10.046754,2.905719,0.714436,40.724647,-73.940364,0.009824,...,29.971096,0.224741,63.10378,6.927872,0.004618,29.971096,63.10378,6.927872,40.728555,-73.938693
std,30436070.0,20289860.0,2952.385585,2183.845409,6.938928,2.079231,0.451688,0.07165,0.059713,0.098631,...,0.137942,0.417417,8.455597,4.510178,0.016344,0.131158,6.939011,3.179832,0.06542,0.059582
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.576152,-74.073622,0.0,...,29.55,0.0,46.9,0.0,0.0,29.59,49.4,0.0,40.600204,-74.01487
25%,10397620.0,7613712.0,274.0,237.0,4.0,1.0,0.0,40.677107,-73.987342,0.0,...,29.89,0.0,57.0,4.6,0.0,29.913333,58.283333,4.816667,40.688591,-73.98513
50%,18183890.0,13316090.0,905.0,664.0,12.0,3.0,1.0,40.717241,-73.953459,0.0,...,29.96,0.0,61.0,6.9,0.0,29.958,60.95,6.166667,40.72057,-73.94915
75%,32630490.0,23937710.0,2255.0,1537.0,16.0,5.0,1.0,40.759123,-73.907733,0.0,...,30.06,0.0,69.1,9.2,0.0,30.06,67.466667,8.85,40.755226,-73.912033
max,235774600.0,149378200.0,32814.0,34828.0,20.0,6.0,1.0,40.889185,-73.755383,1.0,...,30.32,1.0,86.0,23.0,0.1575,30.293333,79.8,17.083333,40.862064,-73.694176


## Calculating correlation using Pearson's r coefficient

In [27]:
def correlation(x, y):
    '''
    Compute the correlation (Pearson's r) between two variables.

    correlation = average of (x in standard units) times (y in standard units)

    Parameters:
        x, y -- NumPy arrays or Pandas Series of equal length.

    Returns:
        The mean of the element-wise product of the standardized inputs.
    '''
    centered_x = x - x.mean()
    centered_y = y - y.mean()

    # ddof=0 must be passed explicitly: the Pandas std() default is ddof=1
    # (sample standard deviation), while this formula needs the population
    # standard deviation.
    standard_x = centered_x / centered_x.std(ddof=0)
    standard_y = centered_y / centered_y.std(ddof=0)

    products = standard_x * standard_y
    return products.mean()


# Pull out the columns to compare; these names are reused by later cells.
entries = subway_df['ENTRIESn_hourly']
cum_entries = subway_df['ENTRIESn']  # presumably the cumulative entry counter -- confirm against the data dictionary
rain = subway_df['meanprecipi']
temp = subway_df['meantempi']

# Correlation of each pair, printed one per line
print(correlation(entries, rain))
print(correlation(entries, temp))
print(correlation(rain, temp))
print(correlation(entries, cum_entries))

0.035648515772230396
-0.026693348321570484
-0.22903432340834323
0.5858954707662166


## Using numpy.corrcoef for Pearson's r coefficient

In [28]:
def correlation_numpy(x, y):
    '''
    Compute the correlation (Pearson's r) between x and y with np.corrcoef.

    Parameters:
        x, y -- one-dimensional sequences of numbers of equal length
                (NumPy arrays or Pandas Series).

    Returns:
        The correlation coefficient between x and y.
    '''
    # corrcoef returns a 2x2 matrix for two inputs: the diagonal holds each
    # variable's correlation with itself, and the off-diagonal entry [0, 1]
    # is the correlation of x with y.
    corr_matrix = np.corrcoef(x, y)
    return corr_matrix[0, 1]


# Same four pairs as the hand-written correlation() above, for comparison;
# the results agree with it up to floating-point rounding.
print(correlation_numpy(entries, rain))
print(correlation_numpy(entries, temp))
print(correlation_numpy(rain, temp))
print(correlation_numpy(entries, cum_entries))

0.03564851577224401
-0.026693348321570824
-0.2290343234084345
0.585895470766208


## Pandas axis names

In [29]:
# axis='index' is the named form of axis=0: the mean is taken down the index,
# giving one value per column.
# numeric_only=True is required because subway_df also holds text columns
# (e.g. UNIT, DATEn, TIMEn); pandas >= 2.0 raises a TypeError on them instead
# of silently dropping them as older versions did.
subway_df.mean(axis='index', numeric_only=True)

ENTRIESn           2.812486e+07
EXITSn             1.986993e+07
ENTRIESn_hourly    1.886590e+03
EXITSn_hourly      1.361488e+03
hour               1.004675e+01
day_week           2.905719e+00
weekday            7.144364e-01
latitude           4.072465e+01
longitude         -7.394036e+01
fog                9.824380e-03
precipi            4.617693e-03
pressurei          2.997110e+01
rain               2.247415e-01
tempi              6.310378e+01
wspdi              6.927872e+00
meanprecipi        4.617693e-03
meanpressurei      2.997110e+01
meantempi          6.310378e+01
meanwspdi          6.927872e+00
weather_lat        4.072856e+01
weather_lon       -7.393869e+01
dtype: float64

In [30]:
# axis='columns' is the named form of axis=1: the mean is taken across the
# columns, giving one value per row.
# numeric_only=True restricts the calculation to numeric columns; without it
# pandas >= 2.0 raises a TypeError because each row mixes text and numbers.
subway_df.mean(axis='columns', numeric_only=True)

0        347593.254693
1        347593.260883
2        347594.286122
3        347594.661360
4        347594.352788
             ...      
42644     33346.539255
42645     16701.355921
42646     24540.756398
42647     24540.916874
42648     24540.293540
Length: 42649, dtype: float64