# Data Analysis Examples

Now that we've reached the end of the book's main chapters, we're going to take a look at a number of real-world datasets. For each dataset, we'll use the techniques presented in this book to extract meaning from the raw data. The demonstrated techniques can be applied to all manner of other datasets, including your own. This chapter contains a collection of miscellaneous example datasets that you can use for practice with the tools in this book.
The example datasets are found in the book's accompanying GitHub repository (http://github.com/wesm/pydata-book).

## USA.gov Data from Bitly

In [None]:
path = '../datasets/bitly_usagov/example.txt'

In [None]:
# Peek at the first line of the file; the context manager closes the handle
# as soon as the line has been read instead of leaving it open.
with open(path) as f:
    first_line = f.readline()
first_line

In [None]:
import json
path = '../datasets/bitly_usagov/example.txt'
# Each line of the file is decoded as its own JSON document. The `with` block
# closes the file once every line has been consumed.
with open(path) as f:
    records = [json.loads(line) for line in f]

In [None]:
records[0]

### Counting Time Zones in Pure Python

In [None]:
# Left disabled: rec['tz'] raises KeyError for any record that has no 'tz' key.
# time_zones = [rec['tz'] for rec in records]

In [None]:
time_zones = [rec['tz'] for rec in records if 'tz' in rec]

In [None]:
time_zones[:10]

In [None]:
def get_counts(sequence):
    """Tally how many times each item occurs in ``sequence``.

    Returns a plain dict mapping every distinct item to its number of
    occurrences; keys appear in the order the items were first seen.
    """
    tally = {}
    for item in sequence:
        # ``get`` yields 0 for an item that has not been counted yet.
        tally[item] = tally.get(item, 0) + 1
    return tally

In [None]:
from collections import defaultdict

In [None]:
def get_counts2(sequence):
    """Count occurrences of each item in ``sequence`` using a ``defaultdict``.

    Returns a ``defaultdict(int)`` keyed by item, so looking up an item that
    never occurred gives 0.
    """
    tally = defaultdict(int)  # a missing key starts out at int() == 0
    for item in sequence:
        tally[item] += 1
    return tally

In [None]:
counts = get_counts(time_zones)

In [None]:
counts['America/New_York']

In [None]:
len(time_zones)

In [None]:
def top_counts(count_dict, n=10):
    """Return the ``n`` largest ``(count, key)`` pairs from ``count_dict``.

    Parameters:
        count_dict: mapping of key -> count.
        n: maximum number of pairs to return (default 10).

    Returns:
        A list of ``(count, key)`` tuples sorted in ascending order, so the
        largest count comes last. Equal counts are ordered by key. If ``n``
        is zero or negative, an empty list is returned.
    """
    if n <= 0:
        # ``pairs[-0:]`` is the whole list, not an empty one, so a
        # non-positive n has to be handled before slicing.
        return []
    value_key_pairs = sorted((count, key) for key, count in count_dict.items())
    return value_key_pairs[-n:]

In [None]:
top_counts(counts)

In [None]:
from collections import Counter

In [None]:
counts = Counter(time_zones)

In [None]:
counts.most_common(10)

### Counting Time Zones with pandas

In [None]:
import pandas as pd

In [None]:
frame = pd.DataFrame(records)

In [None]:
frame.info()

In [None]:
frame['tz'][:10]

In [None]:
tz_counts = frame['tz'].value_counts()

In [None]:
tz_counts[:10]

In [None]:
import seaborn as sns

In [None]:
subset = tz_counts[:10]

In [None]:
sns.barplot(y=subset.index,x=subset.values)

In [None]:
frame['a'][1]

In [None]:
frame['a'][50]

In [None]:
frame['a'][51][:50]

In [None]:
results = pd.Series([x.split()[0] for x in frame.a.dropna()])

In [None]:
results[:5]

In [None]:
results.value_counts()[:8]

In [None]:
# Keep only rows where column 'a' is present. Take an explicit copy so that
# adding columns to cframe later does not write into a slice of `frame`
# (which makes pandas emit SettingWithCopyWarning).
cframe = frame[frame.a.notnull()].copy()

In [None]:
import numpy as np

In [None]:
# Label each row 'Windows' when its string in column 'a' contains 'Windows',
# and 'Not Windows' otherwise; the labels are stored in a new 'os' column.
cframe['os'] = np.where(cframe['a'].str.contains('Windows'),
                       'Windows','Not Windows')

In [None]:
cframe['os'][:5]

In [None]:
by_tz_os = cframe.groupby(['tz','os'])

In [None]:
agg_counts = by_tz_os.size().unstack().fillna(0)

In [None]:
agg_counts[:10]

In [None]:
# Sum each row across its columns, then argsort to get the positions that
# would arrange those row totals in ascending order
indexer = agg_counts.sum(1).argsort()

In [None]:
indexer[:10]

In [None]:
count_subset = agg_counts.take(indexer[-10:])

In [None]:
count_subset

In [None]:
agg_counts.sum(1).nlargest(10)

In [None]:
# Rearrange the data for plotting
count_subset = count_subset.stack()

In [None]:
count_subset.name = 'total'

In [None]:
count_subset = count_subset.reset_index()

In [None]:
count_subset[:10]

In [None]:
sns.barplot(x='total',y='tz',hue='os',data=count_subset)

In [None]:
def norm_total(group):
    """Return ``group`` with a ``normed_total`` column added.

    ``normed_total`` is each row's ``total`` divided by the sum of ``total``
    over the whole group. A new DataFrame is returned via ``assign``; the
    frame passed in is left unmodified, so the function is safe to use with
    ``groupby(...).apply``.
    """
    return group.assign(normed_total=group.total / group.total.sum())

In [None]:
results = count_subset.groupby('tz').apply(norm_total)

In [None]:
sns.barplot(x='normed_total',y='tz',hue='os',data=results)

In [None]:
g = count_subset.groupby('tz')

In [None]:
results2 = count_subset.total / g.total.transform('sum')

## MovieLens 1M Dataset

In [None]:
import pandas as pd

In [None]:
# Make display smaller
pd.options.display.max_rows = 10

In [None]:
unames = ['user_id','gender','age','occupation','zip']
# A separator longer than one character is handled only by the Python
# parsing engine; requesting it explicitly avoids the ParserWarning pandas
# emits when it has to fall back from the C engine.
users = pd.read_table('../datasets/movielens/users.dat', sep='::',
                      header=None, names=unames, engine='python')

In [None]:
rnames = ['user_id','movie_id','rating','timestamp']
# A separator longer than one character is handled only by the Python
# parsing engine; requesting it explicitly avoids the ParserWarning pandas
# emits when it has to fall back from the C engine.
ratings = pd.read_table('../datasets/movielens/ratings.dat', sep='::',
                        header=None, names=rnames, engine='python')

In [None]:
mnames = ['movie_id','title','genres']
# A separator longer than one character is handled only by the Python
# parsing engine; requesting it explicitly avoids the ParserWarning pandas
# emits when it has to fall back from the C engine.
movies = pd.read_table('../datasets/movielens/movies.dat', sep='::',
                       header=None, names=mnames, engine='python')

In [None]:
users[:5]

In [None]:
ratings[:5]

In [None]:
movies[:5]

In [None]:
ratings

In [None]:
data = pd.merge(pd.merge(ratings,users),movies)

In [None]:
data

In [None]:
data.iloc[0]

In [None]:
mean_ratings = data.pivot_table('rating',index='title',
                                columns='gender', aggfunc='mean')

In [None]:
mean_ratings[:5]

In [None]:
ratings_by_title = data.groupby('title').size()

In [None]:
ratings_by_title[:10]

In [None]:
active_titles = ratings_by_title.index[ratings_by_title >= 250]

In [None]:
active_titles

In [None]:
# Select rows on the index
mean_ratings = mean_ratings.loc[active_titles]

In [None]:
mean_ratings

In [None]:
top_female_ratings = mean_ratings.sort_values(by='F',ascending=False)

In [None]:
top_female_ratings[:10]

### Measuring Rating Disagreement

In [None]:
mean_ratings['diff'] = mean_ratings['M']-mean_ratings['F']

In [None]:
sorted_by_diff = mean_ratings.sort_values(by='diff')

In [None]:
sorted_by_diff[:10]

In [None]:
# Reverse order of rows, take first 10 rows
sorted_by_diff[::-1][:10]

In [None]:
#Standard deviation of rating grouped by title
rating_std_by_title = data.groupby('title')['rating'].std()

In [None]:
# Filter down to active_titles
rating_std_by_title = rating_std_by_title.loc[active_titles]

In [None]:
# Order Series by value in descending order
rating_std_by_title.sort_values(ascending=False)[:10]

## US Baby Names 1880-2010

In [None]:
!head -n 10 ../datasets/babynames/yob1880.txt

In [None]:
import pandas as pd

In [None]:
names1880 = pd.read_csv('../datasets/babynames/yob1880.txt',
                       names=['name','sex','births'])

In [None]:
names1880

In [None]:
names1880.groupby('sex').births.sum()

In [None]:
years = range(1880,2011)
pieces = []
columns = ['names','sex','births']

In [None]:
# Read one file per year into its own DataFrame and collect them in `pieces`.
# NOTE: this loop rebinds the notebook-level names `path` and `frame` that
# earlier sections also use.
for year in years:
    path = '../datasets/babynames/yob%d.txt' % year
    frame = pd.read_csv(path,names=columns)
    # Record on every row which year's file it was read from
    frame['year'] = year
    pieces.append(frame)

In [None]:
# Concatenate everything into a single DataFrame
names = pd.concat(pieces, ignore_index=True)

In [None]:
names

In [None]:
# Total births per year (rows) and sex (columns). The aggregation is named by
# string so pandas uses its own sum implementation rather than being handed
# Python's builtin `sum` callable.
total_births = names.pivot_table('births', index='year',
                                 columns='sex', aggfunc='sum')

In [None]:
total_births.tail()

In [None]:
total_births.plot(title='Total births by sex and year')

In [None]:
def add_prop(group):
    """Return ``group`` with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the total ``births`` of the
    group. A new DataFrame is returned via ``assign`` rather than writing
    into the group object that pandas passes in.
    """
    return group.assign(prop=group.births / group.births.sum())

# group_keys=False keeps the original row index instead of prepending the
# (year, sex) group keys to it; otherwise 'year' and 'sex' would exist both as
# index levels and as columns, and a later groupby on them would be ambiguous.
names = names.groupby(['year','sex'], group_keys=False).apply(add_prop)

In [None]:
names

In [None]:
names.groupby(['year','sex']).prop.sum()

In [None]:
def get_top1000(group):
    """Return up to 1000 rows of ``group`` with the most births, largest first."""
    ranked = group.sort_values(by='births', ascending=False)
    return ranked.head(1000)

grouped = names.groupby(['year','sex'])
# Apply per group, then throw away the group index, which is not needed
top1000 = grouped.apply(get_top1000).reset_index(drop=True)

In [None]:
# Same result built by hand: take up to the 1000 highest-birth rows from each
# group and concatenate the pieces under a fresh integer index.
pieces = []
# NOTE: grouping by two columns yields a (year, sex) key for each group, so the
# loop variable named `year` holds that pair rather than a bare year.
for year, group in names.groupby(['year','sex']):
    pieces.append(group.sort_values(by='births',ascending=False)[:1000])
top1000 = pd.concat(pieces, ignore_index = True)

In [None]:
top1000

### Analyzing Naming Trends

In [None]:
boys = top1000[top1000.sex == 'M']

In [None]:
girls = top1000[top1000.sex == 'F']

In [None]:
# Births per year (rows) for each name (columns). The aggregation is named by
# string so pandas uses its own sum implementation rather than being handed
# Python's builtin `sum` callable.
total_births = top1000.pivot_table('births', index='year',
                                   columns='names',
                                   aggfunc='sum')

In [None]:
total_births.info()

In [None]:
subset = total_births[['John','Harry','Mary','Marilyn']]

In [None]:
subset.plot(subplots=True,figsize=(12,10),grid=False,
           title="Number of births per year")

#### Measuring the increase in naming diversity

In [None]:
# Sum of `prop` per year (rows) and sex (columns). The aggregation is named by
# string so pandas uses its own sum implementation rather than being handed
# Python's builtin `sum` callable.
table = top1000.pivot_table('prop', index='year',
                            columns='sex', aggfunc='sum')

In [None]:
table.plot(title='Sum of table1000.prop by year and sex',
          yticks=np.linspace(0,1.2,13), xticks=range(1880,2020,10))

In [None]:
df = boys[boys.year==2010]

In [None]:
df

In [None]:
prop_cumsum = df.sort_values(by='prop',ascending=False).prop.cumsum()

In [None]:
prop_cumsum[:10]

In [None]:
# Zero-based insertion position of 0.5 within the cumulative proportions.
# NOTE: the 1900 computation further down adds 1 to turn this position into a
# count of names; this cell reports the raw position.
prop_cumsum.values.searchsorted(0.5)

In [None]:
df = boys[boys.year==1900]

In [None]:
in1900 = df.sort_values(by='prop',ascending=False).prop.cumsum()

In [None]:
in1900.values.searchsorted(0.5)+1

## USDA Food Database

## 2012 Federal Election Commission Database

### Donation Statistics by Occupation and Employer

### Bucketing Donation Amounts

### Donation Statistics by State