### 3장 : Pandas 입문

# Pandas 데이터프레임

In [None]:
import numpy as np
import scipy as sp
import pandas as pd

## 데이터프레임으로 파일 불러오기

In [None]:
from pandas.io.parsers import read_csv

# Read the WHO sample CSV from the working directory into a DataFrame and echo it.
df = pd.read_csv("WHO_first9cols.csv")
print("Dataframe:\n", df)

In [None]:
# Walk through the frame's structural attributes. Every section except the
# last is followed by two blank lines, produced here through `end`.
print("Shape:\n", df.shape, end="\n\n\n")
print("Length:\n", len(df), end="\n\n\n")
print("Column Headers:\n", df.columns, end="\n\n\n")
print("Data types:\n", df.dtypes, end="\n\n\n")
print("Index:\n", df.index, end="\n\n\n")
print("Values:\n", df.values)

# Pandas 시리즈(Series)

In [None]:
# Pull a single column out by label and compare its type with the frame's.
country_col = df["Country"]
print("Type df:\n", type(df), end=" \n\n")
print("Type country col:\n", type(country_col), end=" \n\n")

In [None]:
# The selected column exposes its own shape, index, raw values and name.
print("Series shape:\n", country_col.shape, end=" \n\n")
print("Series index:\n", country_col.index, end=" \n\n")
print("Series values:\n", country_col.values, end=" \n\n")
print("Series name:\n", country_col.name, end=" \n\n")

In [None]:
# Slice the last two entries of the column and show the type of the slice.
print("Last 2 countries:\n", country_col[-2:], end=" \n\n")
print("Last 2 countries type:\n", type(country_col[-2:]), end=" \n\n")

In [None]:
# Feed the frame's final column to a NumPy ufunc (element-wise sign).
last_col = df.columns[-1]
print("Last df column signs:\n", last_col, np.sign(df[last_col]), end=" \n\n")

In [None]:
# NaN propagates through NumPy's plain sum, so this expression evaluates to nan.
np.sum([0, np.nan])

In [None]:
# Display the dtype of every column of df.
df.dtypes

In [None]:
# Subtract the column's own raw values from it and total the result with the
# pandas reduction (np.sum on a Series delegates to the same method).
# NOTE(review): missing entries stay NaN after the subtraction and are
# presumably skipped by the sum — confirm against the printed value.
print((df[last_col] - df[last_col].values).sum())

# Pandas 데이터 검색

In [None]:
import quandl

In [None]:
# Fetch the "SIDC/SUNSPOTS_A" dataset through quandl and keep it as `sunspots`.
# NOTE(review): presumably a date-indexed DataFrame (later cells slice it with
# date strings and select a 'Number of Observations' column) — confirm.
sunspots = quandl.get("SIDC/SUNSPOTS_A")

In [None]:
# Show the first two rows.
print("Head 2:\n", sunspots.head(2) )

In [None]:
# Show the last two rows.
print("Tail 2:\n", sunspots.tail(2))

In [None]:
# Take the final index label and look up its row by label with .loc.
last_date = sunspots.index[-1]
print("Last value:\n",sunspots.loc[last_date])

In [None]:
# Slice rows between two YYYYMMDD strings.
# NOTE(review): presumably a label slice on a date index with both endpoints
# included — confirm.
print("Values slice by date:\n", sunspots["20020101": "20131231"])

In [None]:
# Positional row selection with a list; negative positions count from the end.
print("Slice from a list of indices:\n", sunspots.iloc[[2, 4, -4, -2]])

In [None]:
# Read single cells by (row, column) position: row 0 through iloc, row 1 through iat.
print("Scalar with Iloc:", sunspots.iloc[0, 0])
print("Scalar with iat", sunspots.iat[1, 0])

In [None]:
# Compare every cell with its column mean and index the frame with the
# resulting boolean frame.
# NOTE(review): cells that fail the comparison are presumably shown as NaN
# rather than dropped — confirm against the output.
print("Boolean selection:\n", sunspots[sunspots > sunspots.mean()])

In [None]:
# Keep only the rows whose 'Number of Observations' exceeds that column's mean.
print("Boolean selection with column label:\n", sunspots[sunspots['Number of Observations'] > sunspots['Number of Observations'].mean()])

# Pandas 데이터프레임과 통계학

In [None]:
import quandl

# Data downloaded from http://www.quandl.com/SIDC/SUNSPOTS_A-Sunspot-Numbers-Annual
# PyPi url https://pypi.python.org/pypi/Quandl
sunspots = quandl.get("SIDC/SUNSPOTS_A")

# Column-wise descriptive statistics of the fetched frame.
print("Describe", sunspots.describe(), "\n")
print("Non NaN observations", sunspots.count(), "\n")
# DataFrame.mad() was deprecated in pandas 1.5 and removed in 2.0. The mean
# absolute deviation around the column mean is spelled out instead; it yields
# the same per-column values on every pandas version.
print("MAD", (sunspots - sunspots.mean()).abs().mean(), "\n")
print("Median", sunspots.median(), "\n")
print("Min", sunspots.min(), "\n")
print("Max", sunspots.max(), "\n")
print("Mode", sunspots.mode(), "\n")
print("Standard Deviation", sunspots.std(), "\n")
print("Variance", sunspots.var(), "\n")
print("Skewness", sunspots.skew(), "\n")
print("Kurtosis", sunspots.kurt(), "\n")

# 데이터 수집

In [None]:
import pandas as pd
from numpy.random import seed
from numpy.random import rand
from numpy.random import randint
import numpy as np

# Fix the global NumPy seed so the random columns are reproducible.
seed(42)

# Seven-row sample table: two text columns plus a random price and a random
# count for each row.
df = pd.DataFrame({
    'Weather': ['cold', 'hot', 'cold', 'hot', 'cold', 'hot', 'cold'],
    'Food': ['soup', 'soup', 'icecream', 'chocolate',
             'icecream', 'icecream', 'soup'],
    'Price': 10 * rand(7),
    # size=7 draws one integer in [1, 9) per row. A bare randint(1, 9) returns
    # a single scalar that pandas broadcasts, giving every row the same count.
    'Number': randint(1, 9, size=7),
})

print(df)

In [None]:
# Split the rows by their 'Weather' value.
weather_group = df.groupby('Weather')

# Iterating a GroupBy yields (key, sub-frame) pairs; enumerate numbers them
# from 1 for display instead of maintaining a counter by hand.
for i, (name, group) in enumerate(weather_group, start=1):
    print("Group", i, name)
    print(group)

In [None]:
# First row, last row and mean of each weather group.
print("Weather group first\n", weather_group.first())
print("Weather group last\n", weather_group.last())
# numeric_only=True keeps the text column 'Food' out of the average. Since
# pandas 2.0 non-numeric columns are no longer dropped silently and mean()
# raises TypeError on them.
print("Weather group mean\n", weather_group.mean(numeric_only=True))

In [None]:
# Group on the (Weather, Food) pair and show which row labels fall in each group.
wf_group = df.groupby(by=['Weather', 'Food'])
print("WF Groups", wf_group.groups)

In [None]:
# Aggregate each (Weather, Food) group with two reductions at once. The string
# names select pandas' own implementations; passing np.mean / np.median
# callables is deprecated and produces the same 'mean' / 'median' labels.
print("WF Aggregated\n", wf_group.agg(['mean', 'median']))

# 데이터프레임 연쇄와 추가

In [None]:
# Show the first three rows of df.
print("df :3\n", df[:3])

In [None]:
# Split df after its third row and stitch the two pieces back together.
print("Concat Back together\n", pd.concat([df[:3], df[3:]]))

In [None]:
# Put the rows from position 5 onward after the first three rows.
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
print("Appending rows\n", pd.concat([df[:3], df[5:]]))

# 데이터프레임 조인(join)

In [None]:
# Load the two CSV tables that the join examples combine.
dests = pd.read_csv('dest.csv')
print("Dests\n", dests)

tips = pd.read_csv('tips.csv')
print("Tips\n", tips)

# Merge on the shared 'EmpNr' column.
print("Merge() on key\n", dests.merge(tips, on='EmpNr'))
# join() needs suffixes here to tell apart column names the two frames share.
print("Dests join() tips\n", dests.join(tips, lsuffix='Dest', rsuffix='Tips'))

# No explicit key is given, so merge decides which columns to match on.
print("Inner join with merge()\n", dests.merge(tips, how='inner'))
print("Outer join\n", dests.merge(tips, how='outer'))

# 누락된 데이터 다루기

In [None]:
df = pd.read_csv('WHO_first9cols.csv')
# Keep the country name and the second-to-last column, first two rows only.
# NOTE(review): this cell was previously described as selecting the first
# three countries and the male net primary school enrolment ratio (%); the
# slice keeps two rows — confirm which was intended.
df = df[['Country', df.columns[-2]]].head(2)
print("New df\n", df)
# Locate and count the missing entries, then show the complementary mask.
print("Null Values\n", df.isnull())
print("Total Null Values\n", df.isnull().sum())
print("Not Null Values\n", df.notnull())
# Arithmetic on the last column, first with a scalar and then with NaN.
print("Last Column Doubled\n", df[df.columns[-1]] * 2)
print("Last Column plus NaN\n", np.nan + df[df.columns[-1]])
# Replace every missing entry with 0.
print("Zero filled\n", df.fillna(0))

# 날짜 다루기

In [None]:
# Build 42 consecutive daily timestamps starting at 1/1/1900.
print("Date range", pd.date_range('1/1/1900', periods=42, freq='D'))

In [None]:
import sys
# Try to build a daily range starting in 1677 and report a failure instead of
# aborting. pandas signals out-of-range dates with OutOfBoundsDatetime, a
# ValueError subclass, so only that family is caught rather than everything.
# NOTE(review): whether this start date is rejected depends on the timestamp
# resolution of the installed pandas — confirm.
try:
    print("Date range", pd.date_range('1/1/1677', periods=4, freq='D'))
except ValueError as err:
    print("Error encountered", type(err), err)

In [None]:
# A nanosecond timestamp is a signed 64-bit count around the 1970 epoch, so it
# reaches about 2**63 ns on either side. The span is expressed in whole
# seconds: floor division keeps both printed bounds inside the representable
# range. The earlier exponent of 33 gave only about 8.6 seconds, not the
# roughly 292-year span the labels describe.
offset = pd.DateOffset(seconds=2 ** 63 // 10 ** 9)
mid = pd.to_datetime('1/1/1970')
print("Start valid range", mid - offset)
print("End valid range", mid + offset)

In [None]:
# Parse compact YYYYMMDD strings by giving the format explicitly.
print("With format", pd.to_datetime(['19021112', '19031230'], format='%Y%m%d'))

In [None]:
# Strings that are not dates cannot be converted; try adding errors='coerce'.
# The expected parse failure (a ValueError) is reported rather than left to
# abort the run, matching how the out-of-range date example handles its error.
try:
    print("Illegal date", pd.to_datetime(['1902-11-12', 'not a date']))
except ValueError as err:
    print("Error encountered", type(err), err)

In [None]:
# Same input with errors='coerce': the unparseable entry becomes NaT instead
# of raising.
print("Illegal date coerced", pd.to_datetime(['1902-11-12', 'not a date'], errors='coerce'))

# 피벗 테이블

In [None]:
# Fix the global NumPy seed so the random columns are reproducible.
seed(42)
# Number of rows; it must match the length of the literal text columns below.
N = 7
df = pd.DataFrame({
    'Weather': ['cold', 'hot', 'cold', 'hot', 'cold', 'hot', 'cold'],
    'Food': ['soup', 'soup', 'icecream', 'chocolate',
             'icecream', 'icecream', 'soup'],
    'Price': 10 * rand(N),
    # size=N draws one integer in [1, 9) per row. A bare randint(1, 9) returns
    # a single scalar that pandas broadcasts, giving every row the same count.
    'Number': randint(1, 9, size=N),
})

In [None]:
# Show the frame that the pivot table below is built from.
print("DataFrame\n", df)

In [None]:
# Total the numeric columns per food type, with one output column per food.
# The numeric columns are named explicitly so the text column 'Weather' is not
# pulled into the aggregation. aggfunc='sum' replaces the deprecated np.sum
# callable and gives the same totals.
print(pd.pivot_table(df, values=['Number', 'Price'], columns=['Food'], aggfunc='sum'))