In [None]:
%pylab inline
import pandas
import seaborn

# Load CSV file into memory

In [None]:
# Location of the raw April 2014 Uber CSV, relative to the notebook.
csv_path = './data/uber-raw-data-apr14.csv'
# Load the whole file into the DataFrame every later cell works on.
data = pandas.read_csv(csv_path)

In [None]:
# Show the last rows of the freshly loaded frame as a quick sanity check.
data.tail()

# Convert datetime and add some useful columns

In [None]:
%time data['Date/Time'] = data['Date/Time'].map(pandas.to_datetime)

In [None]:
# Re-display the last rows to check the 'Date/Time' column after conversion.
data.tail()

In [None]:
def get_dom(dt):
    """Return the day-of-month of ``dt``.

    ``dt`` is any object exposing a ``day`` attribute (for example a
    ``datetime.datetime`` or a pandas ``Timestamp``).
    """
    day_of_month = dt.day
    return day_of_month

# Day of the month for every row, taken from the datetime column with the
# vectorized .dt accessor instead of calling a Python helper once per row.
data['dom'] = data['Date/Time'].dt.day
    

In [None]:
# Display the last rows to confirm the new 'dom' column is present.
data.tail()

In [None]:
def get_weekday(dt):
    """Return ``dt.weekday()`` for a date-like object.

    For ``datetime`` objects and pandas ``Timestamp`` this is an integer
    where Monday is 0 and Sunday is 6.
    """
    weekday_number = dt.weekday()
    return weekday_number

# Weekday number (Monday=0 ... Sunday=6) for every row, computed with the
# vectorized .dt accessor rather than a per-row Python call.
data['weekday'] = data['Date/Time'].dt.weekday

def get_hour(dt):
    """Return the hour component of ``dt``.

    ``dt`` is any object exposing an ``hour`` attribute (for example a
    ``datetime.datetime`` or a pandas ``Timestamp``).
    """
    hour_of_day = dt.hour
    return hour_of_day

# Hour of day (0-23) for every row, computed with the vectorized .dt accessor
# rather than a per-row Python call.
data['hour'] = data['Date/Time'].dt.hour

# Display the last rows to confirm the derived columns were added.
data.tail()

# analysis

## Analyze the day of the month (DoM)

In [None]:
# Histogram of rows per day of the month: 30 bins whose edges sit on the
# half-integers, so each bar is centred on its day number.
hist(data['dom'], range=(0.5, 30.5), bins=30, rwidth=0.8)
title('Frequency by DoM - uber - Apr 2014')
xlabel('date of the month')
ylabel('frequency');

In [None]:

#for k, rows in data.groupby('dom'):
#    print((k, len(rows)))
 
def count_rows(rows):
    """Return how many rows ``rows`` contains, i.e. ``len(rows)``."""
    n_rows = len(rows)
    return n_rows

# Number of rows for each day of the month. GroupBy.size() counts the rows of
# every group directly, without calling a Python function once per group via
# apply().
by_date = data.groupby('dom').size()
by_date

In [None]:
# Use the group keys (days of the month) as x positions instead of a
# hard-coded range(1, 31), so the chart stays correct if the data does not
# contain exactly 30 distinct days.
bar(by_date.index, by_date);

In [None]:
# Same per-day counts, reordered from the quietest day to the busiest.
by_date_sorted = by_date.sort_values(ascending=True)
by_date_sorted

In [None]:
figure(figsize=(15, 7))
# Bars are drawn in ascending-count order at consecutive slots 1..N; the tick
# labels show which day of the month each slot holds. N comes from the series
# itself instead of a hard-coded 30.
positions = range(1, len(by_date_sorted) + 1)
bar(positions, by_date_sorted)
xticks(positions, by_date_sorted.index)
xlabel('date of the month')
ylabel('frequency')
# The trailing semicolon must sit on the same line as a statement: a line
# containing only ';' is a SyntaxError in Python.
title('Frequency by DoM - uber - Apr 2014');

## analyze the hour

In [None]:
figure(figsize=(15, 7))
# 24 bins of width exactly 1, centred on the integer hours 0..23. An upper
# edge of 24 would stretch every bin slightly and shift the bars off the ticks.
hist(data.hour, bins=24, range=(-0.5, 23.5), rwidth=.89)
# One tick per hour. The tick positions are their own labels; passing the
# DataFrame's row index as labels gives a label count that does not match
# the number of ticks.
xticks(range(24))
xlabel('hour')
ylabel('frequency')
title('Frequency by hours- uber - Apr 2014');

## Analyze the weekday


In [None]:
# Tick labels for weekday numbers 0..6, in the order the ticks are placed.
weekday_names = 'Mon Tue Wed Thu Fri Sat Sun'.split()

figure(figsize=(15, 7))
# Seven unit-width bins centred on the weekday numbers 0..6.
hist(data['weekday'], range=(-0.5, 6.5), bins=7, rwidth=0.8, alpha=0.4, color='#AA6666')
xticks(range(len(weekday_names)), weekday_names);

# Cross analysis (hour, day of week)

In [None]:

# Row counts for every (weekday, hour) pair, pivoted so that weekdays are the
# rows and hours are the columns. GroupBy.size() counts each group directly
# instead of calling a Python function once per group via apply().
by_cross = data.groupby('weekday hour'.split()).size().unstack()

In [None]:
figure(figsize=(12, 8))
# Label the rows with weekday names (weekday numbers use Monday=0), looked up
# from the table's own index so the labels follow whatever rows are present.
day_names = 'Mon Tue Wed Thu Fri Sat Sun'.split()
seaborn.heatmap(by_cross, yticklabels=[day_names[d] for d in by_cross.index])
xlabel('hour')
ylabel('Day')
# The figure shows weekday x hour counts, so the title says so rather than
# repeating the hour-histogram title.
title('Frequency by weekday and hour - uber - Apr 2014');

# By latitude and longitude

In [None]:
# Distribution of the 'Lat' column, restricted to the 40.5-41 window and
# split into 100 bins.
figure(figsize=(15, 7))
hist(data.Lat, range=(40.5, 41), bins=100);

In [None]:
# Distribution of the 'Lon' column, restricted to the -74.1 to -73.9 window
# and split into 100 bins.
figure(figsize=(15, 7))
hist(data.Lon, range=(-74.1, -73.9), bins=100);

In [None]:
# Overlay the longitude and latitude histograms in one figure. The two
# columns cover different value ranges, so each gets its own x-axis while the
# frequency (y) axis is shared.
figure(figsize=(15,7))
# First axes: longitude in green; the labels set here apply to the bottom x-axis.
hist(data['Lon'], bins=100, range = (-74.1, -73.9), color='g', alpha=.5, label = 'longitude', rwidth=.85)
grid()
legend(loc='upper left', fontsize='18')
xlabel('Geo-position',fontsize = 20)
ylabel('Frequency',fontsize = 20)
# twiny() adds a second x-axis on top that shares the y-axis; the pyplot calls
# below act on that new axes, so statement order matters here.
twiny()
grid()
# Second axes: latitude in red, drawn against the top x-axis.
hist(data['Lat'], bins=100, range = (40.5, 41), color='r', alpha=.5, label = 'latitude', rwidth=.85)
legend(loc='best',fontsize='18') 
xlabel('Geo-position',fontsize = 20);

In [None]:
figure(figsize=(15, 15))
# Every row as a tiny, nearly transparent black dot, so dense areas appear
# darker than sparse ones.
plot(data.Lon, data.Lat, '.', color='black', alpha=.09, ms=2, label='')
# Alternative marker left disabled by the author:
# plot(-74.034991,40.739840,'*',color = 'red', ms=10)
# Single red star used as a reference point on the map.
plot(-73.973657, 40.762383, '*', ms=15, color='red', label='Trump Tower')
# Restrict the view to a fixed longitude/latitude window.
xlim(-74.10, -73.85)
ylim(40.6, 40.9)
grid()
legend(loc='upper left', fontsize='18');

In [None]:
figure(figsize=(8, 3))
# 24 bins of width exactly 1, centred on the integer hours 0..23. An upper
# edge of 24 would stretch every bin slightly and shift the bars off the ticks.
hist(data.hour, bins=24, range=(-0.5, 23.5), rwidth=.89)
# One tick per hour. The tick positions are their own labels; passing the
# DataFrame's row index as labels gives a label count that does not match
# the number of ticks.
xticks(range(24))
xlabel('hour')
ylabel('frequency')
title('Frequency by hours- uber - Apr 2014');

In [None]:
# Hours of the day to draw, one subplot each (nine hours fill the 3x3 grid).
lista_horas = [0, 7, 10, 13, 15, 18, 20, 2, 17]

# Marker appearance shared by every panel.
spt = 1.4          # marker size
alfa = .4          # marker opacity
colorin = 'red'    # marker colour

# Longitude / latitude window shown in every panel.
x_min, x_max = -74.10, -73.85
y_min, y_max = 40.6, 40.9

figure(figsize=(30, 30))
for i, h in enumerate(lista_horas):
    # Subplot slots are numbered from 1, left to right and top to bottom.
    plt.subplot(3, 3, i + 1)
    # Keep only the rows recorded during hour h.
    df_hour = data.loc[data['hour'] == h]
    plot(
        df_hour['Lon'],
        df_hour['Lat'],
        '.',
        color=colorin,
        alpha=alfa,
        ms=spt,
        label='hour = ' + str(h),
    )
    xlim(x_min, x_max)
    ylim(y_min, y_max)
    grid()
    legend(loc='upper left', fontsize='32');
    