# An introduction to Pandas  
Pandasの紹介です．

In [59]:
import numpy as np

# pandas の import 文はこのまま覚えましょう
from pandas import Series, DataFrame
import pandas as pd

In [5]:
# Create a Series from a plain list; no index is passed here.
obj = Series([4, 7, -5, 3])

In [8]:
# Show the Series (labels in the left column, values in the right).
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [11]:
# Use .values to work with the data held by the Series as an array.
obj.values

array([ 4,  7, -5,  3])

In [16]:
# By default the index is created as a RangeIndex.
obj.index

RangeIndex(start=0, stop=4, step=1)

In [14]:
# To choose the index labels yourself, pass them through `index`.
obj2 = Series(
    [4, 7, -5, 3],
    index=['d', 'b', 'a', 'c'],
)

In [15]:
# The index now holds the labels that were passed in.
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [17]:
# Positional access, as with a NumPy array. Use .iloc so the integer is always
# read as a position: obj2 has string labels, and plain obj2[0] on such a
# Series is deprecated in recent pandas releases.
obj2.iloc[0]

4

In [19]:
# An index label (here a string) can also be given inside [].
obj2['d']

4

In [20]:
# Passing a list of labels returns the entries arranged in that order.
obj2[['a', 'b', 'c']]

a   -5
b    7
c    3
dtype: int64

In [23]:
# An integer slice picks elements by position: here the first three.
obj2[:3]

d    4
b    7
a   -5
dtype: int64

In [24]:
# Arithmetic works as in NumPy: it is applied to each value, and the index
# labels are kept.
obj2 * 2

d     8
b    14
a   -10
c     6
dtype: int64

In [25]:
# Example data written as a dict, keyed by state name.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}

In [28]:
# Passing a dict to Series yields a Series whose index is the dict's keys and
# whose values are the dict's values.
obj3 = Series(sdata)

In [29]:
# Show the Series built from the dict.
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [35]:
# When a dict is passed together with an explicit index, labels found among
# the dict's keys keep their value, while labels missing from the dict come
# back as NaN instead of raising an error.
states = ['California', 'Ohio', 'Oregon', 'Texas']
Series(sdata, index = states)

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [36]:
# Keep the Series built from sdata with the `states` index for the checks
# that follow.
obj4 = Series(sdata, index = states)

In [38]:
# Check for missing values: True where the entry is NaN.
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [39]:
# The inverse check: True where a value is present.
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [40]:
# Same check as pd.isnull(obj4), written as a method call.
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [41]:
# Same check as pd.notnull(obj4), written as a method call.
obj4.notnull()

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [45]:
# A Series can be given a name that describes what it holds.
obj4.name = 'population'

In [43]:
# Show the Series; its name appears in the last line of the display.
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [46]:
# The index can be named as well, stating what the labels mean.
obj4.index.name = 'state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

# DataFrame  
データ分析で最もよく使う型が DataFrame（データフレーム）です．  
行列は全ての要素が同じ型である必要がありますが，データフレームは列ごとにデータ型が異なっていても構いません．

In [47]:
# A dict of equal-length lists: each key becomes one column of the frame.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
frame = DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [49]:
# The column order can be specified when the frame is created.
DataFrame(data, columns = ['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [51]:
# Row labels are set with `index`. 'debt' is not a key of `data`, so that
# column is created with missing values.
frame2 = DataFrame(data, columns = ['year', 'state', 'pop', 'debt'],
                   index = ['one', 'two', 'three', 'four', 'five'])

In [52]:
# Show the frame; the 'debt' column has no values.
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [54]:
# Filling while reindexing a Series: start with values at labels 0, 2 and 4
# only.
obj = Series(
    ['blue', 'purple', 'yellow'],
    index=[0, 2, 4],
)
obj

0      blue
2    purple
4    yellow
dtype: object

In [55]:
# Reindex to labels 0-5; method = 'ffill' gives each new label the value of
# the closest existing label before it.
obj.reindex(range(6), method = 'ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [57]:
# Stretch three values over 60 rows with forward fill. With labels 0, 10 and
# 20 this yields 10 x blue (0-9), 10 x purple (10-19) and 40 x yellow (20-59).
# NOTE: to get 10 / 20 / 30 elements the labels would have to be 0, 10 and 30.
obj = Series(['blue', 'purple', 'yellow'], index = [0, 10, 20])
obj.reindex(range(60), method = 'ffill')

0       blue
1       blue
2       blue
3       blue
4       blue
5       blue
6       blue
7       blue
8       blue
9       blue
10    purple
11    purple
12    purple
13    purple
14    purple
15    purple
16    purple
17    purple
18    purple
19    purple
20    yellow
21    yellow
22    yellow
23    yellow
24    yellow
25    yellow
26    yellow
27    yellow
28    yellow
29    yellow
30    yellow
31    yellow
32    yellow
33    yellow
34    yellow
35    yellow
36    yellow
37    yellow
38    yellow
39    yellow
40    yellow
41    yellow
42    yellow
43    yellow
44    yellow
45    yellow
46    yellow
47    yellow
48    yellow
49    yellow
50    yellow
51    yellow
52    yellow
53    yellow
54    yellow
55    yellow
56    yellow
57    yellow
58    yellow
59    yellow
dtype: object

---

In [60]:
# A 3x3 frame holding 0..8, with row labels a/c/d and state names as columns.
frame = DataFrame(
    np.arange(9).reshape(3, 3),
    index=['a', 'c', 'd'],
    columns=['Ohio', 'Texas', 'California'],
)

In [61]:
# Show the frame.
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [64]:
# Labels missing from the original index ('b' here) become rows of NaN.
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [65]:
# Columns can be reindexed too: 'Utah' is not an existing column, so it
# comes back with no values.
states = ['Texas', 'Utah', 'California']
frame.reindex(columns = states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [72]:
# Five floats 0.0-4.0 labelled 'a' to 'e'.
obj = Series(
    np.arange(5.),
    index=list('abcde'),
)
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [74]:
# drop returns a new Series without the given label.
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [75]:
# A list of labels removes several entries at once.
obj.drop(['d', 'c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [78]:
# Build a 4x4 frame of the integers 0-15 with state names as the row index.
# The third column label is 'three' with no leading space, so the column can
# be reached as data['three'].
data = DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)

data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [79]:
# On a DataFrame, drop without an axis removes rows by index label.
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [80]:
# axis = 1 drops a column instead of a row.
data.drop('two', axis = 1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [81]:
# Several columns can be dropped at once by passing a list of labels.
data.drop(['two','four'], axis = 1)

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14
