# Beginning Data Analysis 

In [None]:
import pandas as pd
import numpy as np
# Keep rendered frames compact. Fully qualified option names are used because
# the shorthand keys ('max_columns', 'max_rows') are regex patterns that also
# match 'styler.render.*' options in newer pandas and then raise OptionError.
pd.set_option('display.max_columns', 4)
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_colwidth', 12)

## Introduction

## Developing a data analysis routine

### How to do it...

In [None]:
# Load the college dataset and show one row picked with a fixed seed.
college = pd.read_csv('data/college.csv')
college.sample(n=1, random_state=42)

In [None]:
# Dimensions of the frame as a (rows, columns) tuple.
college.shape

In [None]:
# Print the frame's per-column summary (the output of DataFrame.info).
college.info()

In [None]:
# Summary statistics for the numeric columns, transposed so that every
# dataset column becomes a row of the output.
college.describe(include=[np.number]).transpose()

In [None]:
# Summary statistics for the non-numeric columns.
# The builtin `object` replaces the `np.object` alias, which was removed in
# NumPy 1.24, and the string 'category' is the documented selector for
# categorical columns (pd.Categorical is an array class, not a dtype).
college.describe(include=[object, 'category']).T

### How it works...

### There's more...

In [None]:
# Numeric summary again, this time with extra percentiles at both tails.
(college
   .describe(percentiles=[0.01, 0.05, 0.10, 0.25, 0.50,
                          0.75, 0.90, 0.95, 0.99],
             include=[np.number])
   .transpose()
)

## Data dictionaries

In [None]:
# Read the college data-dictionary CSV and display it.
pd.read_csv('data/college_data_dictionary.csv')

## Reducing memory by changing data types

### How to do it...

In [None]:
# Re-read the data and keep the five columns listed in `different_cols`
# to experiment on.
college = pd.read_csv('data/college.csv')
different_cols = [
    'RELAFFIL',
    'SATMTMID',
    'CURROPER',
    'INSTNM',
    'STABBR',
]
col2 = college.loc[:, different_cols]
col2.head(5)

In [None]:
# Data type of each selected column.
col2.dtypes

In [None]:
# Record the memory footprint of each column as a baseline; deep=True asks
# pandas to also inspect the contents of object columns.
original_mem = col2.memory_usage(deep=True)
original_mem

In [None]:
# Downcast RELAFFIL to an 8-bit integer.
col2['RELAFFIL'] = col2['RELAFFIL'].astype('int8')

In [None]:
# Re-check the dtypes after the RELAFFIL conversion.
col2.dtypes

In [None]:
# Measure the converted frame (col2). Re-selecting the columns from `college`
# would report the untouched int64 RELAFFIL and hide the saving.
col2.memory_usage(deep=True)

In [None]:
# Number of distinct values in each object-dtype column.
col2.select_dtypes(include='object').nunique()

In [None]:
# Store STABBR as a categorical, then confirm the new dtype.
col2['STABBR'] = col2['STABBR'].astype('category')
col2.dtypes

In [None]:
# Memory footprint of each column after the dtype changes made to col2.
new_mem = col2.memory_usage(deep=True)
new_mem

In [None]:
# Ratio of new to original memory per column (below 1 means a saving).
new_mem.div(original_mem)

### How it works...

### There's more...

In [None]:
# In the row labelled 0, store a much larger integer in CURROPER and append one
# character to INSTNM, then measure the memory of both columns.
college.loc[0, 'CURROPER'] = 10000000
college.loc[0, 'INSTNM'] += 'a'
college.loc[:, ['CURROPER', 'INSTNM']].memory_usage(deep=True)

In [None]:
# dtype currently used to store MENONLY.
college['MENONLY'].dtype

In [None]:
# Try to downcast MENONLY to an 8-bit integer. pandas raises a ValueError when
# an integer cast meets missing values, which would stop "Run All" at this
# cell, so the error is caught and its message becomes the cell output.
# NOTE(review): presumably MENONLY contains NaN in this dataset -- confirm.
try:
    menonly_int8 = college['MENONLY'].astype(np.int8)
except ValueError as err:
    menonly_int8 = f'{type(err).__name__}: {err}'
menonly_int8

In [None]:
# Display a new frame with MENONLY stored as float16 and RELAFFIL as int8;
# `college` itself is not modified.
college.astype({'MENONLY': 'float16', 'RELAFFIL': 'int8'})

In [None]:
# Replace the index with a materialised 64-bit integer Index and report its
# memory. pd.Int64Index was removed in pandas 2.0; a plain pd.Index built from
# int64 values is the supported equivalent. The values are extracted with
# to_numpy() first so the constructor receives an array rather than the
# existing index object.
college.index = pd.Index(college.index.to_numpy(dtype='int64'))
college.index.memory_usage() # previously was just 80

## Selecting the smallest of the largest

### How to do it...

In [None]:
# Load the movie data and keep only the title, IMDB score and budget columns.
movie = pd.read_csv('data/movie.csv')
movie2 = movie.loc[:, ['movie_title', 'imdb_score', 'budget']]
movie2.head(5)

In [None]:
# First five of the 100 rows with the largest imdb_score.
movie2.nlargest(n=100, columns='imdb_score').head(5)

In [None]:
# Of the 100 rows with the largest imdb_score, the five with the
# smallest budget.
(movie2
  .nlargest(n=100, columns='imdb_score')
  .nsmallest(n=5, columns='budget')
)

### How it works...

### There's more...

## Selecting the largest of each group by sorting

### How to do it...

In [None]:
# Reload the movie data and show the title, year and score columns.
movie = pd.read_csv('data/movie.csv')
movie.loc[:, ['movie_title', 'title_year', 'imdb_score']]

In [None]:
# Order the movies from the latest title_year to the earliest.
(movie
  .loc[:, ['movie_title', 'title_year', 'imdb_score']]
  .sort_values(by='title_year', ascending=False)
)

In [None]:
# Sort by year and, within each year, by score -- both descending.
(movie
  .loc[:, ['movie_title', 'title_year', 'imdb_score']]
  .sort_values(by=['title_year', 'imdb_score'],
               ascending=[False, False])
)

In [None]:
# After sorting by year and score (both descending), keeping the first row
# seen for each year leaves that year's highest-scoring movie.
(movie
  .loc[:, ['movie_title', 'title_year', 'imdb_score']]
  .sort_values(by=['title_year', 'imdb_score'],
               ascending=[False, False])
  .drop_duplicates(subset=['title_year'], keep='first')
)

### How it works...

### There's more...

In [None]:
# Same result via groupby: within each title_year group sort by score and keep
# the top row, then order the groups from latest year to earliest.
# NOTE(review): newer pandas versions change how groupby.apply treats the
# grouping column -- confirm this cell against the pandas version in use.
(movie
  .loc[:, ['movie_title', 'title_year', 'imdb_score']]
  .groupby('title_year', as_index=False)
  .apply(lambda year_df: (year_df
                          .sort_values(by='imdb_score', ascending=False)
                          .head(1)))
  .sort_values(by='title_year', ascending=False)
)

In [None]:
# Sort by year and content rating (descending) and budget (ascending), then
# keep the first row of every (year, rating) pair.
(movie
  .loc[:, ['movie_title', 'title_year', 'content_rating', 'budget']]
  .sort_values(by=['title_year', 'content_rating', 'budget'],
               ascending=[False, False, True])
  .drop_duplicates(subset=['title_year', 'content_rating'], keep='first')
)

## Replicating nlargest with sort_values

### How to do it...

In [None]:
# Reload the movie data; of the 100 rows with the largest imdb_score,
# show the five with the smallest budget.
movie = pd.read_csv('data/movie.csv')
(movie
   .loc[:, ['movie_title', 'imdb_score', 'budget']]
   .nlargest(n=100, columns='imdb_score')
   .nsmallest(n=5, columns='budget')
)

In [None]:
# First step of the sort_values version: sort by score descending and
# take the first 100 rows.
(movie
   .loc[:, ['movie_title', 'imdb_score', 'budget']]
   .sort_values(by='imdb_score', ascending=False)
   .head(n=100)
)

In [None]:
# Full sort_values version: first 100 rows by descending score, then the
# first five of those by ascending budget.
(movie
   .loc[:, ['movie_title', 'imdb_score', 'budget']]
   .sort_values(by='imdb_score', ascending=False)
   .head(n=100)
   .sort_values(by='budget', ascending=True)
   .head(n=5)
)

### How it works...

In [None]:
# Last five rows of the 100 selected by nlargest, for comparison with the
# sort_values version.
(movie
   .loc[:, ['movie_title', 'imdb_score', 'budget']]
   .nlargest(n=100, columns='imdb_score')
   .tail(5)
)

In [None]:
# Last five rows of the 100 selected by sort_values + head, for comparison
# with the nlargest version.
(movie
   .loc[:, ['movie_title', 'imdb_score', 'budget']]
   .sort_values(by='imdb_score', ascending=False)
   .head(n=100)
   .tail(5)
)

## Calculating a trailing stop order price

### How to do it...

In [None]:
import datetime
import pandas_datareader.data as web
import requests_cache
# Cached HTTP session: SQLite backend, cache named 'cache', entries
# expire after 90 days.
session = requests_cache.CachedSession(
    cache_name='cache',
    backend='sqlite',
    expire_after=datetime.timedelta(days=90),
)

In [None]:
# Fetch TSLA data from the 'yahoo' source starting 2017-1-1 through the
# cached session, then show the first eight rows.
# NOTE(review): no end date is passed, so the downloaded range presumably
# depends on when the cell is run -- confirm.
tsla = web.DataReader(
    'tsla',
    data_source='yahoo',
    start='2017-1-1',
    session=session,
)
tsla.head(n=8)

In [None]:
# Pull out the 'Close' column for the calculations that follow.
tsla_close = tsla['Close']

In [None]:
# Running maximum of the closing price (the highest close seen so far).
tsla_cummax = tsla_close.cummax()
tsla_cummax.head(5)

In [None]:
# Trailing stop price: 90% of the running maximum close (first five rows).
tsla['Close'].cummax().mul(0.9).head(5)

### How it works...

### There's more...