## DataFrameの再学習

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [6]:
# Load the gapminder sample dataset from plotly express as the working DataFrame.
df = px.data.gapminder()

In [8]:
# Inspect the dtype of every column.
df.dtypes

country       object
continent     object
year           int64
lifeExp      float64
pop            int64
gdpPercap    float64
iso_alpha     object
iso_num        int64
dtype: object

In [10]:
# Print a structural summary: index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
 6   iso_alpha  1704 non-null   object 
 7   iso_num    1704 non-null   int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 106.6+ KB


In [30]:
# Cast year to strings. astype returns a new Series, so the result has to be
# assigned back to the column for the change to take effect.
df['year'] = df['year'].astype(str)

In [22]:
df.info() # confirm that year is now reported with object dtype

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   object 
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
 6   iso_alpha  1704 non-null   object 
 7   iso_num    1704 non-null   int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 106.6+ KB


In [26]:
# Convert year to 64-bit floats, then re-check the column summary.
df['year'] = df['year'].astype('float64')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   float64
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
 6   iso_alpha  1704 non-null   object 
 7   iso_num    1704 non-null   int64  
dtypes: float64(3), int64(2), object(3)
memory usage: 106.6+ KB


In [32]:
# Assigning to a row label that does not exist yet (1704) appends a new row.
# Only year receives a value ('abc'); the following df.info() output shows
# 1705 entries, year as object, and pop / iso_num reported as float64.
df.loc[1704, 'year'] = 'abc'

In [34]:
# Re-inspect the frame after inserting the extra row.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1705 entries, 0 to 1704
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1705 non-null   object 
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   float64
 5   gdpPercap  1704 non-null   float64
 6   iso_alpha  1704 non-null   object 
 7   iso_num    1704 non-null   float64
dtypes: float64(4), object(4)
memory usage: 106.7+ KB


In [36]:
# With year stored as object dtype and the non-numeric string 'abc' in row 1704,
# a plain astype(float) cannot parse the column and raises ValueError.
# The exception is the point of this cell, so it is caught and displayed; an
# uncaught error would stop "Restart Kernel -> Run All" at this cell.
try:
    df['year'] = df['year'].astype(float)
except ValueError as exc:
    print(f"{type(exc).__name__}: {exc}")

ValueError: could not convert string to float: 'abc'

In [40]:
# pd.to_numeric with errors='coerce' turns values that cannot be parsed as
# numbers (here 'abc') into NaN instead of raising.
df['year'] = pd.to_numeric(df['year'], errors='coerce')

In [42]:
df.dtypes # confirm via dtypes that the conversion worked

country       object
continent     object
year         float64
lifeExp      float64
pop          float64
gdpPercap    float64
iso_alpha     object
iso_num      float64
dtype: object

In [46]:
# Keep only the object-dtype columns, then list their names.
# The selection is bound to a name so it is computed once, rather than being
# evaluated, discarded, and evaluated again.
object_cols_df = df.select_dtypes(include='object')
object_cols_df.columns

Index(['country', 'continent', 'iso_alpha'], dtype='object')

In [48]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,year,lifeExp,pop,gdpPercap,iso_num
count,1704.0,1704.0,1704.0,1704.0,1704.0
mean,1979.5,59.474439,29601210.0,7215.327081,425.880282
std,17.26533,12.917107,106157900.0,9857.454543,248.305709
min,1952.0,23.599,60011.0,241.165876,4.0
25%,1965.75,48.198,2793664.0,1202.060309,208.0
50%,1979.5,60.7125,7023596.0,3531.846989,410.0
75%,1993.25,70.8455,19585220.0,9325.462346,638.0
max,2007.0,82.603,1318683000.0,113523.1329,894.0


In [50]:
# Remove the demo row (label 1704) that was added to trigger the conversion error.
# Rebinding the result avoids inplace=True, and errors='ignore' keeps the cell
# re-runnable: a second run does not raise KeyError once the label is gone.
df = df.drop(index=1704, errors='ignore')

In [52]:
# Show the last five rows of the frame.
df.tail(n=5)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
1699,Zimbabwe,Africa,1987.0,62.351,9216418.0,706.157306,ZWE,716.0
1700,Zimbabwe,Africa,1992.0,60.377,10704340.0,693.420786,ZWE,716.0
1701,Zimbabwe,Africa,1997.0,46.809,11404948.0,792.44996,ZWE,716.0
1702,Zimbabwe,Africa,2002.0,39.989,11926563.0,672.038623,ZWE,716.0
1703,Zimbabwe,Africa,2007.0,43.487,12311143.0,469.709298,ZWE,716.0


In [54]:
# Distinct country names, without duplicates.
df['country'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
       'Australia', 'Austria', 'Bahrain', 'Bangladesh', 'Belgium',
       'Benin', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon',
       'Canada', 'Central African Republic', 'Chad', 'Chile', 'China',
       'Colombia', 'Comoros', 'Congo, Dem. Rep.', 'Congo, Rep.',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Czech Republic',
       'Denmark', 'Djibouti', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Ethiopia',
       'Finland', 'France', 'Gabon', 'Gambia', 'Germany', 'Ghana',
       'Greece', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Haiti',
       'Honduras', 'Hong Kong, China', 'Hungary', 'Iceland', 'India',
       'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy',
       'Jamaica', 'Japan', 'Jordan', 'Kenya', 'Korea, Dem. Rep.',
       'Korea, Rep.', 'Kuwait', 'Leba

In [56]:
# Number of distinct countries.
df['country'].nunique()

142

In [62]:
# Row count per continent, ordered from the smallest count to the largest.
df['continent'].value_counts(ascending=True)

continent
Oceania      24
Americas    300
Europe      360
Asia        396
Africa      624
Name: count, dtype: int64

In [68]:
# Only the last expression of the cell is displayed as its output; the results
# of the earlier sort_values calls are not assigned to anything.

# Ascending by a single column.
df.sort_values('year')
# Descending by a single column.
df.sort_values('year', ascending=False)
# Descending by several columns: year first, then lifeExp.
df.sort_values(['year', 'lifeExp'], ascending=[False, False])

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
803,Japan,Asia,2007.0,82.603,127467972.0,31656.068060,JPN,392.0
671,"Hong Kong, China",Asia,2007.0,82.208,6980412.0,39724.978670,HKG,344.0
695,Iceland,Europe,2007.0,81.757,301931.0,36180.789190,ISL,352.0
1487,Switzerland,Europe,2007.0,81.701,7554661.0,37506.419070,CHE,756.0
71,Australia,Oceania,2007.0,81.235,20434176.0,34435.367440,AUS,36.0
...,...,...,...,...,...,...,...,...
1032,Mozambique,Africa,1952.0,31.286,6446316.0,468.526038,MOZ,508.0
1344,Sierra Leone,Africa,1952.0,30.331,2143249.0,879.787736,SLE,694.0
36,Angola,Africa,1952.0,30.015,4232095.0,3520.610273,AGO,24.0
552,Gambia,Africa,1952.0,30.000,284320.0,485.230659,GMB,270.0


In [70]:
# Display df itself; the rows are still in their original index order.
df

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
0,Afghanistan,Asia,1952.0,28.801,8425333.0,779.445314,AFG,4.0
1,Afghanistan,Asia,1957.0,30.332,9240934.0,820.853030,AFG,4.0
2,Afghanistan,Asia,1962.0,31.997,10267083.0,853.100710,AFG,4.0
3,Afghanistan,Asia,1967.0,34.020,11537966.0,836.197138,AFG,4.0
4,Afghanistan,Asia,1972.0,36.088,13079460.0,739.981106,AFG,4.0
...,...,...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987.0,62.351,9216418.0,706.157306,ZWE,716.0
1700,Zimbabwe,Africa,1992.0,60.377,10704340.0,693.420786,ZWE,716.0
1701,Zimbabwe,Africa,1997.0,46.809,11404948.0,792.449960,ZWE,716.0
1702,Zimbabwe,Africa,2002.0,39.989,11926563.0,672.038623,ZWE,716.0


In [72]:
# Element-wise comparison: a boolean Series that is True on the rows whose year is 1952.
df['year'].eq(1952)

0        True
1       False
2       False
3       False
4       False
        ...  
1699    False
1700    False
1701    False
1702    False
1703    False
Name: year, Length: 1704, dtype: bool

In [74]:
# Use the boolean mask as a row selector to keep only the rows whose year is 1952.
df.loc[df['year'].eq(1952)]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
0,Afghanistan,Asia,1952.0,28.801,8425333.0,779.445314,AFG,4.0
12,Albania,Europe,1952.0,55.230,1282697.0,1601.056136,ALB,8.0
24,Algeria,Africa,1952.0,43.077,9279525.0,2449.008185,DZA,12.0
36,Angola,Africa,1952.0,30.015,4232095.0,3520.610273,AGO,24.0
48,Argentina,Americas,1952.0,62.485,17876956.0,5911.315053,ARG,32.0
...,...,...,...,...,...,...,...,...
1644,Vietnam,Asia,1952.0,40.412,26246839.0,605.066492,VNM,704.0
1656,West Bank and Gaza,Asia,1952.0,43.160,1030585.0,1515.592329,PSE,275.0
1668,"Yemen, Rep.",Asia,1952.0,32.548,4963829.0,781.717576,YEM,887.0
1680,Zambia,Africa,1952.0,42.038,2672000.0,1147.388831,ZMB,894.0


In [78]:
# Rows whose gdpPercap is above the overall median and whose year is 2000 or later.
gdp_per_cap_med = df['gdpPercap'].median()
condition1 = df['gdpPercap'].gt(gdp_per_cap_med)
condition2 = df['year'].ge(2000)
# Combine both masks element-wise and select the matching rows.
df.loc[condition1 & condition2]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
22,Albania,Europe,2002.0,75.651,3508512.0,4604.211737,ALB,8.0
23,Albania,Europe,2007.0,76.423,3600523.0,5937.029526,ALB,8.0
34,Algeria,Africa,2002.0,70.994,31287142.0,5288.040382,DZA,12.0
35,Algeria,Africa,2007.0,72.301,33333216.0,6223.367465,DZA,12.0
47,Angola,Africa,2007.0,42.731,12420476.0,4797.231267,AGO,24.0
...,...,...,...,...,...,...,...,...
1630,Uruguay,Americas,2002.0,75.307,3363085.0,7727.002004,URY,858.0
1631,Uruguay,Americas,2007.0,76.384,3447496.0,10611.462990,URY,858.0
1642,Venezuela,Americas,2002.0,72.766,24287670.0,8605.047831,VEN,862.0
1643,Venezuela,Americas,2007.0,73.747,26084662.0,11415.805690,VEN,862.0


In [82]:
# df.query expresses a row filter as a string expression.
# Only the last expression of the cell (the second query) is displayed.
df.query('year >= 2000')
df.query('(year >= 2000) & (continent == "Asia")')

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
10,Afghanistan,Asia,2002.0,42.129,25268405.0,726.734055,AFG,4.0
11,Afghanistan,Asia,2007.0,43.828,31889923.0,974.580338,AFG,4.0
94,Bahrain,Asia,2002.0,74.795,656397.0,23403.559270,BHR,48.0
95,Bahrain,Asia,2007.0,75.635,708573.0,29796.048340,BHR,48.0
106,Bangladesh,Asia,2002.0,62.013,135656790.0,1136.390430,BGD,50.0
...,...,...,...,...,...,...,...,...
1655,Vietnam,Asia,2007.0,74.249,85262356.0,2441.576404,VNM,704.0
1666,West Bank and Gaza,Asia,2002.0,72.370,3389578.0,4515.487575,PSE,275.0
1667,West Bank and Gaza,Asia,2007.0,73.422,4018332.0,3025.349798,PSE,275.0
1678,"Yemen, Rep.",Asia,2002.0,60.308,18701257.0,2234.820827,YEM,887.0


In [84]:
# List the distinct continent values.
df['continent'].unique()

array(['Asia', 'Europe', 'Africa', 'Americas', 'Oceania'], dtype=object)

In [88]:
# Inside query, `in` keeps the rows whose continent is one of the listed values.
df.query('continent in ["Asia", "Europe", "Africa"]')

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
0,Afghanistan,Asia,1952.0,28.801,8425333.0,779.445314,AFG,4.0
1,Afghanistan,Asia,1957.0,30.332,9240934.0,820.853030,AFG,4.0
2,Afghanistan,Asia,1962.0,31.997,10267083.0,853.100710,AFG,4.0
3,Afghanistan,Asia,1967.0,34.020,11537966.0,836.197138,AFG,4.0
4,Afghanistan,Asia,1972.0,36.088,13079460.0,739.981106,AFG,4.0
...,...,...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987.0,62.351,9216418.0,706.157306,ZWE,716.0
1700,Zimbabwe,Africa,1992.0,60.377,10704340.0,693.420786,ZWE,716.0
1701,Zimbabwe,Africa,1997.0,46.809,11404948.0,792.449960,ZWE,716.0
1702,Zimbabwe,Africa,2002.0,39.989,11926563.0,672.038623,ZWE,716.0


In [92]:
# A Python variable can be referenced inside the query string with the @ prefix.
# NOTE(review): "continets" looks like a typo for "continents"; the name is left
# unchanged here because the query string refers to it.
target_continets =  ["Asia", "Europe", "Africa"]
df.query('continent in @target_continets')

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
0,Afghanistan,Asia,1952.0,28.801,8425333.0,779.445314,AFG,4.0
1,Afghanistan,Asia,1957.0,30.332,9240934.0,820.853030,AFG,4.0
2,Afghanistan,Asia,1962.0,31.997,10267083.0,853.100710,AFG,4.0
3,Afghanistan,Asia,1967.0,34.020,11537966.0,836.197138,AFG,4.0
4,Afghanistan,Asia,1972.0,36.088,13079460.0,739.981106,AFG,4.0
...,...,...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987.0,62.351,9216418.0,706.157306,ZWE,716.0
1700,Zimbabwe,Africa,1992.0,60.377,10704340.0,693.420786,ZWE,716.0
1701,Zimbabwe,Africa,1997.0,46.809,11404948.0,792.449960,ZWE,716.0
1702,Zimbabwe,Africa,2002.0,39.989,11926563.0,672.038623,ZWE,716.0
