pandas : powerful Python data analysis toolkit

pandasは、データ分析を効率よく行うためのPythonライブラリです。

In [1]:
#pandasのインポート
import pandas as pd

# DataFrameの操作

列:column, 行:row

##  Seriesオブジェクト

一次元の配列(ベクトル)の各要素に、ラベル(index)が付いたもの

In [2]:
# Build a one-dimensional labelled Series: values 1, 2, 3 keyed by "a", "b", "c".
s = pd.Series(
    [1, 2, 3],
    index=["a", "b", "c"],
)
s

a    1
b    2
c    3
dtype: int64

In [3]:
# With no index given, pandas assigns the default integer labels 0, 1, 2.
pd.Series([1,2,3])

0    1
1    2
2    3
dtype: int64

## Seriesの基本操作

In [4]:
# Look up a single value by its index label.
s["b"]

2

In [5]:
# Fetch the first element by position. `.iloc` is used explicitly because a
# bare integer key on a Series with string labels relies on a positional
# fallback that is deprecated since pandas 2.1.
s.iloc[0]

1

In [6]:
# `in` checks the index labels, not the stored values, so this is False
# even though 1 is one of the values.
1 in s

False

In [7]:
# "a" is one of the index labels, so the membership test is True.
"a" in s

True

In [10]:
# Left commented out on purpose: "d" is not an index label, so this lookup
# would raise KeyError.
#s["d"]

In [11]:
# Series.get returns None for a missing label instead of raising.
print(s.get("d"))

None


In [12]:
# Adding two Series adds the values that share the same label.
s + s

a    2
b    4
c    6
dtype: int64

In [13]:
# Multiplying by a scalar applies to every element.
s*2

a    2
b    4
c    6
dtype: int64

In [14]:
# NumPy functions can be applied to a Series directly; the result is
# computed element-wise and keeps the index labels.
import numpy as np
np.exp(s)

a     2.718282
b     7.389056
c    20.085537
dtype: float64

In [16]:
# Arithmetic aligns on labels: a label present in only one operand yields NaN.
# The two positional slices cover "b".."c" and "a".."b", so only "b" overlaps.
s.iloc[1:] + s.iloc[:-1]

a    NaN
b    4.0
c    NaN
dtype: float64

In [17]:
# Read the Series name. No output is shown for this cell, which suggests
# the name is still unset (None) at this point.
s.name

In [18]:
# Assign a name by setting the attribute in place.
s.name = "my series"

In [20]:
# Display s again; the assigned name now appears in the output.
s

a    1
b    2
c    3
Name: my series, dtype: int64

In [21]:
# rename() with a scalar changes the Series name; the result is bound back to s.
s = s.rename("my series 2")
s

a    1
b    2
c    3
Name: my series 2, dtype: int64

In [22]:
# The name can also be supplied when the Series is constructed.
s = pd.Series(
    [1, 2, 3],
    index=["a", "b", "c"],
    name="my series 3",
)
s

a    1
b    2
c    3
Name: my series 3, dtype: int64

## DataFrameの使い方

各Seriesを縦方向の1列(列ベクトル)とみなし、それらを横に並べて表(行列)を構成する。

Series.name(下の例のように辞書から生成する場合は辞書のキー)が列(column)名に、Series.indexが行(row)のラベルになると考える。

In [23]:
# Build a DataFrame from a dict: each key becomes a column name and each
# Series supplies that column's values, with its index used as row labels.
df = pd.DataFrame(
    {
        "math": pd.Series([90, 70, 80], index=["taro", "koji", "ryota"]),
        "science": pd.Series([40, 70, 60], index=["taro", "koji", "ryota"]),
    }
)

In [24]:
# Display the frame: dict keys are the columns, Series index labels are the rows.
df

Unnamed: 0,math,science
taro,90,40
koji,70,70
ryota,80,60


In [25]:
# Keep only the rows labelled "koji" and "taro", in that order.
df = df.reindex(["koji", "taro"])
df

Unnamed: 0,math,science
koji,70,70
taro,90,40


In [26]:
# Column labels of the frame.
df.columns

Index(['math', 'science'], dtype='object')

In [27]:
# Row labels of the frame.
df.index

Index(['koji', 'taro'], dtype='object')

In [28]:
# Build the same score table from a nested list: one inner list per row,
# with row and column labels passed explicitly.
array = [[90, 40], [70, 70], [80, 60]]

df = pd.DataFrame(
    array,
    index=["taro", "koji", "ryota"],
    columns=["math", "science"],
)
df

Unnamed: 0,math,science
taro,90,40
koji,70,70
ryota,80,60


In [30]:
# Multiple category levels: tuple keys in the outer dict become two-level
# column labels and tuple keys in the inner dicts become two-level row labels.
# Row/column combinations that are not supplied are filled with NaN.
pd.DataFrame({('a', 'b'): {('A', 'C'): 1, ('A', 'D'): 2},
                            ('a', 'a'): {('A', 'C'): 3, ('A', 'D'): 4},
                            ('a', 'c'): {('A', 'C'): 5, ('A', 'D'): 6},
                            ('b', 'a'): {('B', 'C'): 7, ('B', 'D'): 8},
                            ('b', 'b'): {('B', 'C'): 9, ('B', 'D'): 10},
                            ('b', 'c'): {('B', 'C'): 11, ('B', 'D'): 12}})

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,a,b,c,a,b,c
A,C,3.0,1.0,5.0,,,
A,D,4.0,2.0,6.0,,,
B,C,,,,7.0,9.0,11.0
B,D,,,,8.0,10.0,12.0


In [31]:
# Delete a column in place with `del`.
del df["math"]
df

Unnamed: 0,science
taro,40
koji,70
ryota,60


In [40]:
# Recreate the two-column frame from `array`, since the "math" column was
# deleted from df in an earlier cell.
df = pd.DataFrame(array, index = ["taro", "koji", "ryota"], columns = ["math", "science"])

In [41]:
# pop() removes the "math" column from df and returns it as a Series.
p = df.pop("math")
p

taro     90
koji     70
ryota    80
Name: math, dtype: int64

In [42]:
# insert(position, name, values) adds a column in place at the given position;
# here the popped Series goes back in as the first column, named "math1".
df.insert(0, "math1", p)
df

Unnamed: 0,math1,science
taro,90,40
koji,70,70
ryota,80,60


In [43]:
# Insert the same Series again at position 1 under the name "math2".
df.insert(1, "math2", p)
df

Unnamed: 0,math1,math2,science
taro,90,90,40
koji,70,70,70
ryota,80,80,60


In [44]:
# Add a derived column holding the sum of the two math columns.
# assign() returns a new frame, so the result is bound back to df.
df = df.assign(math_sum=df["math1"] + df["math2"])
df

Unnamed: 0,math1,math2,science,math_sum
taro,90,90,40,180
koji,70,70,70,140
ryota,80,80,60,160


In [45]:
# Filter rows with a condition written as a string; column names are used
# directly inside the expression.
df.query("math_sum > 150 and science > 50")

Unnamed: 0,math1,math2,science,math_sum
ryota,80,80,60,160


In [47]:
# Remove "science" from df, keeping the column as a Series for re-insertion.
science = df.pop("science")

In [48]:
# Put the popped Series back as "science1", then add "science2" from a plain
# list, whose values are assigned to the rows in order.
df.insert(2, "science1", science)
df.insert(3, "science2", [50, 70, 20])
df

Unnamed: 0,math1,math2,science1,science2,math_sum
taro,90,90,40,50,180
koji,70,70,70,70,140
ryota,80,80,60,20,160


## 要素の抽出(loc, ilocの使い方)

loc : ラベルで取得

iloc : 整数の位置(0始まりの番号)で取得

In [49]:
# Select rows by label: a list of labels returns those rows as a DataFrame.
df.loc[["taro", "ryota"]]

Unnamed: 0,math1,math2,science1,science2,math_sum
taro,90,90,40,50,180
ryota,80,80,60,20,160


In [50]:
# Select columns by label; `:` keeps every row.
df.loc[:, ["math1", "science1"]]

Unnamed: 0,math1,science1
taro,90,40
koji,70,70
ryota,80,60


In [51]:
# Select rows and columns by label at the same time.
df.loc[["taro", "ryota"], ["math1", "science1"]]

Unnamed: 0,math1,science1
taro,90,40
ryota,80,60


In [52]:
# Select rows by 0-based integer position: the second and third rows.
df.iloc[[1, 2]]

Unnamed: 0,math1,math2,science1,science2,math_sum
koji,70,70,70,70,140
ryota,80,80,60,20,160


In [53]:
# Rows at positions 1 and 2, column at position 3 ("science2").
df.iloc[[1,2], [3]]

Unnamed: 0,science2
koji,70
ryota,20


In [54]:
# Slice rows by position: positions 1 and 2 (the end of the slice is exclusive).
df.iloc[1:3]

Unnamed: 0,math1,math2,science1,science2,math_sum
koji,70,70,70,70,140
ryota,80,80,60,20,160


In [56]:
# Combine DataFrames with pd.concat. axis=0 stacks the rows of the second
# frame below the first; "koji" is in both slices, so it appears twice.
pd.concat([df.iloc[0:2],df.iloc[1:3]], axis=0)

Unnamed: 0,math1,math2,science1,science2,math_sum
taro,90,90,40,50,180
koji,70,70,70,70,140
koji,70,70,70,70,140
ryota,80,80,60,20,160


In [57]:
# axis=1 places the frames side by side, aligned on the row labels; a label
# missing from one side gets NaN in that side's columns.
# NOTE(review): both inputs have the same column names, so the result has
# duplicated column labels; the ".1" suffixes in the rendered table look like
# an artifact of how the output was exported -- confirm.
pd.concat([df.iloc[0:2],df.iloc[1:3]], axis=1)

Unnamed: 0,math1,math2,science1,science2,math_sum,math1.1,math2.1,science1.1,science2.1,math_sum.1
koji,70.0,70.0,70.0,70.0,140.0,70.0,70.0,70.0,70.0,140.0
ryota,,,,,,80.0,80.0,60.0,20.0,160.0
taro,90.0,90.0,40.0,50.0,180.0,,,,,


In [58]:
# Transpose: rows become columns and columns become rows.
df.T

Unnamed: 0,taro,koji,ryota
math1,90,70,80
math2,90,70,80
science1,40,70,60
science2,50,70,20
math_sum,180,140,160


## 基礎統計

In [59]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()

Unnamed: 0,math1,math2,science1,science2,math_sum
count,3.0,3.0,3.0,3.0,3.0
mean,80.0,80.0,56.666667,46.666667,160.0
std,10.0,10.0,15.275252,25.166115,20.0
min,70.0,70.0,40.0,20.0,140.0
25%,75.0,75.0,50.0,35.0,150.0
50%,80.0,80.0,60.0,50.0,160.0
75%,85.0,85.0,65.0,60.0,170.0
max,90.0,90.0,70.0,70.0,180.0


In [61]:
# Number of non-missing values in each column.
df.count()

math1       3
math2       3
science1    3
science2    3
math_sum    3
dtype: int64

In [62]:
# Sample standard deviation of each column.
df.std()

math1       10.000000
math2       10.000000
science1    15.275252
science2    25.166115
math_sum    20.000000
dtype: float64

In [63]:
# Mean of each column.
df.mean()

math1        80.000000
math2        80.000000
science1     56.666667
science2     46.666667
math_sum    160.000000
dtype: float64

In [64]:
# Pairwise correlation matrix of the columns (Pearson by default).
df.corr()

Unnamed: 0,math1,math2,science1,science2,math_sum
math1,1.0,1.0,-0.981981,-0.39736,1.0
math2,1.0,1.0,-0.981981,-0.39736,1.0
science1,-0.981981,-0.981981,1.0,0.216777,-0.981981
science2,-0.39736,-0.39736,0.216777,1.0,-0.39736
math_sum,1.0,1.0,-0.981981,-0.39736,1.0
