# 2-4 Pandasの基礎（Numpyを使いやすくしたもの）
## 2-4-1 Pandasライブラリのインポート

In [4]:
#この章で使用するライブラリ
import numpy as np
import numpy.random as random
import scipy as sp
import pandas as pd
from pandas import Series, DataFrame

# 可視化ライブラリ
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

#小数点3位までの表示
%precision 3

'%.3f'

## 2-4-2 Seriesの使い方

In [5]:
# Basic Series usage: hold the multiples of ten from 0 to 90.
# No index is given, so pandas labels the entries 0..9.
sample_pandas_data = pd.Series(list(range(0, 100, 10)))
print(sample_pandas_data)

0     0
1    10
2    20
3    30
4    40
5    50
6    60
7    70
8    80
9    90
dtype: int64


In [6]:
# Series of 0..90 in steps of ten, labelled with the letters 'a'..'j'
# instead of the default integer index.
sample_pandas_index_data = pd.Series(list(range(0, 100, 10)), index=list('abcdefghij'))
print(sample_pandas_index_data)

a     0
b    10
c    20
d    30
e    40
f    50
g    60
h    70
i    80
j    90
dtype: int64


In [7]:
# .values exposes the Series' data as an array; .index exposes its row labels.
print('データの値：', sample_pandas_index_data.values)
print('indexの値：', sample_pandas_index_data.index)

データの値： [ 0 10 20 30 40 50 60 70 80 90]
indexの値： Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')


## 2-4-3 DataFrameの使い方

In [8]:
# Basic DataFrame usage: each dict key becomes a column and each list holds
# that column's values, one per row.
attri_data1 = dict(
    ID=['100', '101', '102', '103', '104'],
    City=['Tokyo', 'Osaka', 'Kyoto', 'Hokkaido', 'Tokyo'],
    Birth_year=[1990, 1989, 1992, 1997, 1982],
    Name=['Hiroshi', 'Akiko', 'Yuki', 'Satoru', 'Steve'],
)
attri_data_frame1 = pd.DataFrame(attri_data1)
print(attri_data_frame1)

    ID      City  Birth_year     Name
0  100     Tokyo        1990  Hiroshi
1  101     Osaka        1989    Akiko
2  102     Kyoto        1992     Yuki
3  103  Hokkaido        1997   Satoru
4  104     Tokyo        1982    Steve


In [9]:
# Build a frame from the attri_data1 dict, labelling the rows 'a'..'e'
# instead of the default 0..4.
attri_data_frame_index1 = pd.DataFrame(attri_data1, index=list('abcde'))
print(attri_data_frame_index1)

    ID      City  Birth_year     Name
a  100     Tokyo        1990  Hiroshi
b  101     Osaka        1989    Akiko
c  102     Kyoto        1992     Yuki
d  103  Hokkaido        1997   Satoru
e  104     Tokyo        1982    Steve


### 3-1 Jupyter環境におけるデータ表示

In [10]:
# Ending a cell with the bare variable lets Jupyter render the frame as a table
# instead of the plain text that print() produces.
attri_data_frame_index1

Unnamed: 0,ID,City,Birth_year,Name
a,100,Tokyo,1990,Hiroshi
b,101,Osaka,1989,Akiko
c,102,Kyoto,1992,Yuki
d,103,Hokkaido,1997,Satoru
e,104,Tokyo,1982,Steve


## 2-4-4 行列操作
### 4-1 転置

In [11]:
# Matrix-style operations start here.
# Transposition: rows become columns and columns become rows.
attri_data_frame1.transpose()

Unnamed: 0,0,1,2,3,4
ID,100,101,102,103,104
City,Tokyo,Osaka,Kyoto,Hokkaido,Tokyo
Birth_year,1990,1989,1992,1997,1982
Name,Hiroshi,Akiko,Yuki,Satoru,Steve


### 4-2 特定列のみの取り出し

In [12]:
# Pull out a single column by its label; the result is a Series.
attri_data_frame1['Birth_year']

0    1990
1    1989
2    1992
3    1997
4    1982
Name: Birth_year, dtype: int64

In [13]:
# To select several columns, pass a list of column labels.
attri_data_frame1.loc[:, ['ID', 'Birth_year']]

Unnamed: 0,ID,Birth_year
0,100,1990
1,101,1989
2,102,1992
3,103,1997
4,104,1982


## 2-4-5 データの抽出

In [14]:
# Conditional extraction: keep only the rows whose City equals 'Tokyo'.
attri_data_frame1.loc[attri_data_frame1['City'].eq('Tokyo')]

Unnamed: 0,ID,City,Birth_year,Name
0,100,Tokyo,1990,Hiroshi
4,104,Tokyo,1982,Steve


In [15]:
# To accept several values at once, test membership with isin().
attri_data_frame1.loc[attri_data_frame1['City'].isin(['Tokyo', 'Osaka'])]

Unnamed: 0,ID,City,Birth_year,Name
0,100,Tokyo,1990,Hiroshi
1,101,Osaka,1989,Akiko
4,104,Tokyo,1982,Steve


In [16]:
# Keep only the rows whose Birth_year is strictly earlier than 1990.
attri_data_frame1.loc[attri_data_frame1['Birth_year'].lt(1990)]

Unnamed: 0,ID,City,Birth_year,Name
1,101,Osaka,1989,Akiko
4,104,Tokyo,1982,Steve


## 2-4-6 データの削除と結合
### 6-1 列や行の削除

In [18]:
# Show the current contents of attri_data_frame1.
attri_data_frame1

Unnamed: 0,ID,City,Birth_year,Name
0,100,Tokyo,1990,Hiroshi
1,101,Osaka,1989,Akiko
2,102,Kyoto,1992,Yuki
3,103,Hokkaido,1997,Satoru
4,104,Tokyo,1982,Steve


In [19]:
# Deleting rows or columns: axis 0 / 'index' targets rows, axis 1 / 'columns'
# targets columns. drop() returns a new frame; passing inplace=True would
# remove the column from the original frame instead.
attri_data_frame1.drop(labels=['Birth_year'], axis='columns')

Unnamed: 0,ID,City,Name
0,100,Tokyo,Hiroshi
1,101,Osaka,Akiko
2,102,Kyoto,Yuki
3,103,Hokkaido,Satoru
4,104,Tokyo,Steve


In [21]:
# drop() was called without inplace=True, so the original frame still has every column.
attri_data_frame1

Unnamed: 0,ID,City,Birth_year,Name
0,100,Tokyo,1990,Hiroshi
1,101,Osaka,1989,Akiko
2,102,Kyoto,1992,Yuki
3,103,Hokkaido,1997,Satoru
4,104,Tokyo,1982,Steve


### 6-2 データの結合

In [23]:
# Joining data: prepare a second data set to combine with the first.
# Its ID column only partly overlaps with attri_data1 ('105' and '107' are new).
attri_data2 = dict(
    ID=['100', '101', '102', '105', '107'],
    Math=[50, 43, 33, 76, 98],
    English=[90, 30, 20, 50, 30],
    Sex=['M', 'F', 'F', 'M', 'M'],
)
# Convert the dict of lists into a DataFrame and display it.
attri_data_frame2 = pd.DataFrame(attri_data2)
attri_data_frame2

Unnamed: 0,ID,Math,English,Sex
0,100,50,90,M
1,101,43,30,F
2,102,33,20,F
3,105,76,50,M
4,107,98,30,M


In [24]:
# Inner-join the two frames on their shared ID column; only IDs present in
# both frames are kept.
# The key and join type are spelled out so the result cannot silently change
# if the frames later gain another column name in common.
# (Author's note: merging is reportedly covered in more detail in chapter 6.)
pd.merge(attri_data_frame1, attri_data_frame2, on='ID', how='inner')

Unnamed: 0,ID,City,Birth_year,Name,Math,English,Sex
0,100,Tokyo,1990,Hiroshi,50,90,M
1,101,Osaka,1989,Akiko,43,30,F
2,102,Kyoto,1992,Yuki,33,20,F


## 2-4-7　集計

In [29]:
# Grouped aggregation: average English score for each value of Sex.
attri_data_frame2.groupby('Sex')['English'].agg('mean')

Sex
F    25.000000
M    56.666667
Name: English, dtype: float64

In [31]:
# Highest Math score within each Sex group.
attri_data_frame2.groupby('Sex')['Math'].agg('max')

Sex
F    43
M    98
Name: Math, dtype: int64

In [32]:
# Lowest Math score across the whole frame (no grouping).
attri_data_frame2.loc[:, 'Math'].min()

33

## 2-4-8 値のソート

In [26]:
# Sorting values: prepare the data.
# The labelled frame deliberately uses an out-of-order index ('e', 'b', 'a',
# 'd', 'c') so that sorting has a visible effect.
attri_data3 = dict(
    ID=['100', '101', '102', '103', '104'],
    City=['Tokyo', 'Osaka', 'Kyoto', 'Hokkaido', 'Tokyo'],
    Birth_year=[1990, 1989, 1992, 1997, 1982],
    Name=['Hiroshi', 'Akiko', 'Yuki', 'Satoru', 'Steve'],
)
attri_data_frame3 = pd.DataFrame(attri_data3)
attri_data_frame_index3 = pd.DataFrame(attri_data3, index=list('ebadc'))
attri_data_frame_index3

Unnamed: 0,ID,City,Birth_year,Name
e,100,Tokyo,1990,Hiroshi
b,101,Osaka,1989,Akiko
a,102,Kyoto,1992,Yuki
d,103,Hokkaido,1997,Satoru
c,104,Tokyo,1982,Steve


In [33]:
# Sort the rows by their index labels, in ascending order.
attri_data_frame_index3.sort_index(axis=0, ascending=True)

Unnamed: 0,ID,City,Birth_year,Name
a,102,Kyoto,1992,Yuki
b,101,Osaka,1989,Akiko
c,104,Tokyo,1982,Steve
d,103,Hokkaido,1997,Satoru
e,100,Tokyo,1990,Hiroshi


In [34]:
# Sort a single column by value; ascending order is the default.
attri_data_frame_index3['Birth_year'].sort_values(ascending=True)

c    1982
b    1989
e    1990
a    1992
d    1997
Name: Birth_year, dtype: int64

## 2-4-9 nan(null)の判定
### 9-1 条件に合致したデータの比較

In [37]:
# Element-wise membership test: the result has True wherever a cell's value
# is 'Tokyo' and False everywhere else.
attri_data_frame_index3.isin(values=['Tokyo'])

Unnamed: 0,ID,City,Birth_year,Name
e,False,True,False,False
b,False,False,False,False
a,False,False,False,False
d,False,False,False,False
c,False,True,False,False


### 9-2 nanとnullの例

In [38]:
# Example of NaN / null (missing) values.
# Handling missing values: overwrite every entry of the Name column with NaN.
# NOTE(review): this modifies attri_data_frame_index3 itself, so any earlier
# cell re-run after this one will show NaN in Name.
attri_data_frame_index3['Name'] = np.nan
attri_data_frame_index3

Unnamed: 0,ID,City,Birth_year,Name
e,100,Tokyo,1990,
b,101,Osaka,1989,
a,102,Kyoto,1992,
d,103,Hokkaido,1997,
c,104,Tokyo,1982,


In [211]:
# Flag the missing cells, then count them per column.
attri_data_frame_index3.isnull().sum(axis=0)

ID            0
City          0
Birth_year    0
Name          5
dtype: int64

## ここからは練習問題

In [39]:
# Exercise 2-7: extract and display the rows whose Money is 500 or more.
# This cell only prepares the data.
attri_data4 = dict(
    ID=['1', '2', '3', '4', '5'],
    Sex=['F', 'F', 'M', 'M', 'F'],
    Money=[1000, 2000, 500, 300, 700],
    Name=['Hiroshi', 'Akiko', 'Yuki', 'Satoru', 'Steve'],
)
attri_data_frame4 = pd.DataFrame(attri_data4)
attri_data_frame4

Unnamed: 0,ID,Sex,Money,Name
0,1,F,1000,Hiroshi
1,2,F,2000,Akiko
2,3,M,500,Yuki
3,4,M,300,Satoru
4,5,F,700,Steve


In [40]:
attri_data_frame4[attri_data_frame4['Money'] > 500]

Unnamed: 0,ID,Sex,Money,Name
0,1,F,1000,Hiroshi
1,2,F,2000,Akiko
4,5,F,700,Steve


In [41]:
# Exercise 2-8: average Money for each value of Sex.
attri_data_frame4.groupby('Sex')['Money'].agg('mean')

Sex
F    1233.333333
M     400.000000
Name: Money, dtype: float64

In [42]:
# Exercise 2-9: prepare a score table to merge with the exercise 2-7 data.
attri_data5 = dict(
    ID=['3', '4', '7'],
    Math=[60, 30, 40],
    English=[80, 20, 30],
)
attri_data_frame5 = pd.DataFrame(attri_data5)
attri_data_frame5

Unnamed: 0,ID,Math,English
0,3,60,80
1,4,30,20
2,7,40,30


In [43]:
# Inner-join the two exercise frames on ID; only IDs present in both are kept.
# The key and join type are explicit so the result does not depend on which
# column names the two frames happen to share.
pd.merge(attri_data_frame4, attri_data_frame5, on='ID', how='inner')

Unnamed: 0,ID,Sex,Money,Name,Math,English
0,3,M,500,Yuki,60,80
1,4,M,300,Satoru,30,20
