In [1]:
#pandas入门

#导入相关模板库
import os
import numpy as np
import pandas as pd

In [2]:
#定义函数（如log_head显示数据集前10行）
def log_head(df, head_count=10):
    print(df.head(head_count))
    return df

def log_columns(df):
    print(df.columns)
    return df

def log_shape(df):
    print(f'shape = {df.shape}')
    return df

In [3]:
#导入数据集并进行显示
df = (pd.read_csv(filepath_or_buffer='./data/master.csv')
      .pipe(log_head, head_count=1)
      .rename(columns={'suicides/100k pop' : 'suicides_per_100k',
                       ' gdp_for_year ($) ' : 'gdp_year', 
                       'gdp_per_capita ($)' : 'gdp_capita',
                       'country-year' : 'country_year'})
      .assign(gdp_year=lambda _df: _df['gdp_year'].str.replace(',','').astype(np.int64))
     )

#从头显示10行
df.head(10)

   country  year   sex          age  suicides_no  population  \
0  Albania  1987  male  15-24 years           21      312900   

   suicides/100k pop country-year  HDI for year  gdp_for_year ($)   \
0               6.71  Albania1987           NaN      2,156,624,900   

   gdp_per_capita ($)    generation  
0                 796  Generation X  


Unnamed: 0,country,year,sex,age,suicides_no,population,suicides_per_100k,country_year,HDI for year,gdp_year,gdp_capita,generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers
5,Albania,1987,female,75+ years,1,35600,2.81,Albania1987,,2156624900,796,G.I. Generation
6,Albania,1987,female,35-54 years,6,278800,2.15,Albania1987,,2156624900,796,Silent
7,Albania,1987,female,25-34 years,4,257200,1.56,Albania1987,,2156624900,796,Boomers
8,Albania,1987,male,55-74 years,1,137500,0.73,Albania1987,,2156624900,796,G.I. Generation
9,Albania,1987,female,5-14 years,0,311000,0.0,Albania1987,,2156624900,796,Generation X


In [4]:
#获取列索引信息
df.columns

Index(['country', 'year', 'sex', 'age', 'suicides_no', 'population',
       'suicides_per_100k', 'country_year', 'HDI for year', 'gdp_year',
       'gdp_capita', 'generation'],
      dtype='object')

In [5]:
#获取df的size
df.shape

(27820, 12)

In [6]:
#获取country列不同值的个数
df['country'].nunique()

101

In [7]:
#获取country列的前十个不重复值
df['country'].unique()[:10]

array(['Albania', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain'],
      dtype=object)

In [8]:
#获取generation列不重复的字段
df['generation'].unique()

array(['Generation X', 'Silent', 'G.I. Generation', 'Boomers',
       'Millenials', 'Generation Z'], dtype=object)

In [9]:
#获取数据集的相关统计信息
df.describe(include='all')

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides_per_100k,country_year,HDI for year,gdp_year,gdp_capita,generation
count,27820,27820.0,27820,27820,27820.0,27820.0,27820.0,27820,8364.0,27820.0,27820.0,27820
unique,101,,2,6,,,,2321,,,,6
top,Iceland,,male,25-34 years,,,,Kazakhstan1993,,,,Generation X
freq,382,,13910,4642,,,,12,,,,6408
mean,,2001.258375,,,242.574407,1844794.0,12.816097,,0.776601,445581000000.0,16866.464414,
std,,8.469055,,,902.047917,3911779.0,18.961511,,0.093367,1453610000000.0,18887.576472,
min,,1985.0,,,0.0,278.0,0.0,,0.483,46919620.0,251.0,
25%,,1995.0,,,3.0,97498.5,0.92,,0.713,8985353000.0,3447.0,
50%,,2002.0,,,25.0,430150.0,5.99,,0.779,48114690000.0,9372.0,
75%,,2008.0,,,131.0,1486143.0,16.62,,0.855,260202400000.0,24874.0,


In [10]:
#获取数据类型
df.dtypes

country               object
year                   int64
sex                   object
age                   object
suicides_no            int64
population             int64
suicides_per_100k    float64
country_year          object
HDI for year         float64
gdp_year               int64
gdp_capita             int64
generation            object
dtype: object

In [11]:
#查看内存消耗
def mem_usage(df):
    return f'{df.memory_usage(deep=True).sum() / 1024 ** 2 : 3.2f} MB'

def convert_df(df):
    return df.astype({
        col: 'category' for col in df.columns 
        if df[col].nunique() / df[col].shape[0] < 0.5})

print(mem_usage(df))
print(mem_usage(df.set_index(['country', 'year', 'sex', 'age'])))
print(mem_usage(convert_df(df)))
print(mem_usage(convert_df(df.set_index(['country', 'year', 'sex', 'age']))))

 10.28 MB
 5.01 MB
 1.40 MB
 1.40 MB


In [12]:
%%time
#创建索引
mi_df = df.set_index(['country', 'year', 'sex', 'age']).sort_index()

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 11.1 ms


In [13]:
#查询信息
mi_df.query('country == "Albania" & year == 1987').head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,suicides_no,population,suicides_per_100k,country_year,HDI for year,gdp_year,gdp_capita,generation
country,year,sex,age,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
Albania,1987,female,25-34 years,4,257200,1.56,Albania1987,,2156624900,796,Boomers
Albania,1987,female,35-54 years,6,278800,2.15,Albania1987,,2156624900,796,Silent
Albania,1987,female,5-14 years,0,311000,0.0,Albania1987,,2156624900,796,Generation X
Albania,1987,female,55-74 years,0,144600,0.0,Albania1987,,2156624900,796,G.I. Generation
Albania,1987,female,75+ years,1,35600,2.81,Albania1987,,2156624900,796,G.I. Generation
Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers
Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
Albania,1987,male,5-14 years,0,338200,0.0,Albania1987,,2156624900,796,Generation X


In [14]:
%%time
#查询信息
df.query('country == "Albania" and year == 1987 and sex == "male" and age == "25-34 years"')

CPU times: user 4.64 ms, sys: 355 µs, total: 5 ms
Wall time: 4.03 ms


Unnamed: 0,country,year,sex,age,suicides_no,population,suicides_per_100k,country_year,HDI for year,gdp_year,gdp_capita,generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers


In [15]:
%%time
#根据索引查询信息
mi_df.loc['Albania', 1987, 'male', '25-34 years']

CPU times: user 1.48 ms, sys: 820 µs, total: 2.3 ms
Wall time: 1.66 ms


suicides_no                    9
population                274300
suicides_per_100k           3.28
country_year         Albania1987
HDI for year                 NaN
gdp_year              2156624900
gdp_capita                   796
generation               Boomers
Name: (Albania, 1987, male, 25-34 years), dtype: object

In [16]:
%%time
#查询信息
mi_df.query('country == "Albania" and year == 1987 and sex == "male" and age == "25-34 years"')

CPU times: user 5.87 ms, sys: 0 ns, total: 5.87 ms
Wall time: 5.01 ms


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,suicides_no,population,suicides_per_100k,country_year,HDI for year,gdp_year,gdp_capita,generation
country,year,sex,age,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers


In [17]:
%%time
#索引排序
mi_df.sort_index().head(10)

CPU times: user 4.74 ms, sys: 0 ns, total: 4.74 ms
Wall time: 4.12 ms


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,suicides_no,population,suicides_per_100k,country_year,HDI for year,gdp_year,gdp_capita,generation
country,year,sex,age,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
Albania,1987,female,25-34 years,4,257200,1.56,Albania1987,,2156624900,796,Boomers
Albania,1987,female,35-54 years,6,278800,2.15,Albania1987,,2156624900,796,Silent
Albania,1987,female,5-14 years,0,311000,0.0,Albania1987,,2156624900,796,Generation X
Albania,1987,female,55-74 years,0,144600,0.0,Albania1987,,2156624900,796,G.I. Generation
Albania,1987,female,75+ years,1,35600,2.81,Albania1987,,2156624900,796,G.I. Generation
Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers
Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
Albania,1987,male,5-14 years,0,338200,0.0,Albania1987,,2156624900,796,Generation X


In [18]:
#查看排序后的索引各元素均大于前一个元素
df.index.is_monotonic

True

In [19]:
#查看排序后的索引各元素均大于前一个元素
mi_df.index.is_monotonic

True

In [20]:
# Indexes are not necessarily unique keys
(pd.DataFrame({'a':range(2), 'b': range(2)}, index=['a', 'a'])
 .loc['a']
)

Unnamed: 0,a,b
a,0,0
a,1,1


In [21]:
%%time
# 单组聚合
(df
 .groupby('age')
 .agg({'generation':'unique'})
 .rename(columns={'generation':'unique_generation'})
)

CPU times: user 4.89 ms, sys: 0 ns, total: 4.89 ms
Wall time: 4.64 ms


Unnamed: 0_level_0,unique_generation
age,Unnamed: 1_level_1
15-24 years,"[Generation X, Millenials]"
25-34 years,"[Boomers, Generation X, Millenials]"
35-54 years,"[Silent, Boomers, Generation X]"
5-14 years,"[Generation X, Millenials, Generation Z]"
55-74 years,"[G.I. Generation, Silent, Boomers]"
75+ years,"[G.I. Generation, Silent]"


In [22]:
%%time
# 单组聚合
(mi_df
 .groupby('age')
 .agg({'generation':'unique'})
 .rename(columns={'generation':'unique_generation'})
)

CPU times: user 4.83 ms, sys: 446 µs, total: 5.28 ms
Wall time: 4.38 ms


Unnamed: 0_level_0,unique_generation
age,Unnamed: 1_level_1
15-24 years,"[Generation X, Millenials]"
25-34 years,"[Boomers, Generation X, Millenials]"
35-54 years,"[Silent, Boomers, Generation X]"
5-14 years,"[Generation X, Millenials, Generation Z]"
55-74 years,"[G.I. Generation, Silent, Boomers]"
75+ years,"[G.I. Generation, Silent]"


In [23]:
%%time
#多组聚合
(df
 .groupby(['country', 'year'])
 .agg({'suicides_per_100k': 'sum'})
 .rename(columns={'suicides_per_100k':'suicides_sum'})
 .sort_values('suicides_sum', ascending=False)
 .head(10)
)

CPU times: user 4.5 ms, sys: 0 ns, total: 4.5 ms
Wall time: 3.64 ms


Unnamed: 0_level_0,Unnamed: 1_level_0,suicides_sum
country,year,Unnamed: 2_level_1
Lithuania,1995,639.3
Lithuania,1996,595.61
Hungary,1991,575.0
Lithuania,2000,571.8
Hungary,1992,570.26
Lithuania,2001,568.98
Russian Federation,1994,567.64
Lithuania,1998,566.36
Lithuania,1997,565.44
Lithuania,1999,561.53


In [24]:
%%time
#多组聚合
(df
 .groupby(['country', 'year'])
 .agg({'suicides_per_100k': 'sum'})
 .rename(columns={'suicides_per_100k':'suicides_sum'})
).nlargest(10, columns='suicides_sum')

CPU times: user 4.2 ms, sys: 103 µs, total: 4.3 ms
Wall time: 3.77 ms


Unnamed: 0_level_0,Unnamed: 1_level_0,suicides_sum
country,year,Unnamed: 2_level_1
Lithuania,1995,639.3
Lithuania,1996,595.61
Hungary,1991,575.0
Lithuania,2000,571.8
Hungary,1992,570.26
Lithuania,2001,568.98
Russian Federation,1994,567.64
Lithuania,1998,566.36
Lithuania,1997,565.44
Lithuania,1999,561.53


In [25]:
#多组聚合
(mi_df
 .loc[('Switzerland', 2000)]
 .unstack('sex')
 [['suicides_no', 'population']]
)

Unnamed: 0_level_0,suicides_no,suicides_no,population,population
sex,female,male,female,male
age,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
15-24 years,20,79,410136,426957
25-34 years,47,147,537823,530378
35-54 years,124,360,1072711,1094229
5-14 years,1,4,412273,436831
55-74 years,128,239,723750,649009
75+ years,79,152,330903,184589


In [26]:
def log_head(df, head_count=10):
    print(df.head(head_count))
    return df

def log_columns(df):
    print(df.columns)
    return df

def log_shape(df):
    print(f'shape = {df.shape}')
    return df

In [27]:
import re

In [28]:
%%time

# 添加列
(df
 .assign(valid_cy=lambda _serie: _serie.apply(
     lambda _row: re.split(r'(?=\d{4})', _row['country_year'])[1] == str(_row['year']),
     axis=1))
 .query('valid_cy == False')
 .pipe(log_shape)
)

shape = (0, 13)
CPU times: user 226 ms, sys: 4.66 ms, total: 231 ms
Wall time: 230 ms


Unnamed: 0,country,year,sex,age,suicides_no,population,suicides_per_100k,country_year,HDI for year,gdp_year,gdp_capita,generation,valid_cy


In [29]:
df.head(5)

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides_per_100k,country_year,HDI for year,gdp_year,gdp_capita,generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers


In [30]:
%%time
# which year, per country, had the most suicides among females
(df
 .pipe(log_shape)
 .query('sex == "female"')
 .groupby(['year', 'country'])
 .agg({'suicides_per_100k':'sum'})
 .pipe(log_shape)
 .rename(columns={'suicides_per_100k':'sum_suicides_per_100k_female'})
 .nlargest(n=10, columns=['sum_suicides_per_100k_female'])
)

shape = (27820, 12)
shape = (2321, 1)
CPU times: user 7.11 ms, sys: 0 ns, total: 7.11 ms
Wall time: 6.21 ms


Unnamed: 0_level_0,Unnamed: 1_level_0,sum_suicides_per_100k_female
year,country,Unnamed: 2_level_1
2009,Republic of Korea,170.89
1989,Singapore,163.16
1986,Singapore,161.67
2010,Republic of Korea,158.52
2007,Republic of Korea,149.6
2011,Republic of Korea,147.84
1991,Hungary,147.35
2008,Republic of Korea,147.04
2000,Aruba,146.22
2005,Republic of Korea,145.35


In [31]:
%%time
# which year, per country, had the most suicides among females
(mi_df
 .query('sex == "female"')
 .groupby(['year', 'country'])
 .agg({'suicides_per_100k':'sum'})
 .rename(columns={'suicides_per_100k':'sum_suicides_per_100k_female'})
 .nlargest(n=10, columns=['sum_suicides_per_100k_female'])
)

CPU times: user 7.96 ms, sys: 0 ns, total: 7.96 ms
Wall time: 6.98 ms


Unnamed: 0_level_0,Unnamed: 1_level_0,sum_suicides_per_100k_female
year,country,Unnamed: 2_level_1
2009,Republic of Korea,170.89
1989,Singapore,163.16
1986,Singapore,161.67
2010,Republic of Korea,158.52
2007,Republic of Korea,149.6
2011,Republic of Korea,147.84
1991,Hungary,147.35
2008,Republic of Korea,147.04
2000,Aruba,146.22
2005,Republic of Korea,145.35


In [32]:
from sklearn.preprocessing import MinMaxScaler

def norm_df(df, columns):
    return df.assign(**{col: MinMaxScaler().fit_transform(df[[col]].values.astype(float)) 
                        for col in columns})

In [33]:
# decrease in gdp correlated with increase in suicide number (is it gender related?)
for sex in ['male', 'female']:
    print(sex)
    print(
        df
        .query(f'sex == "{sex}"')
        .groupby(['country'])
        .agg({'suicides_per_100k': 'sum', 'gdp_year': 'mean'})
        .rename(columns={'suicides_per_100k':'suicides_per_100k_sum', 
                         'gdp_year': 'gdp_year_mean'})
        .pipe(norm_df, columns=['suicides_per_100k_sum', 'gdp_year_mean'])
        .corr(method='spearman')
    )
    print('\n')

male
                       suicides_per_100k_sum  gdp_year_mean
suicides_per_100k_sum               1.000000       0.421218
gdp_year_mean                       0.421218       1.000000


female
                       suicides_per_100k_sum  gdp_year_mean
suicides_per_100k_sum               1.000000       0.452343
gdp_year_mean                       0.452343       1.000000




In [34]:
%%time
#遍历行
for row in df.iterrows(): continue

CPU times: user 1.09 s, sys: 3.96 ms, total: 1.09 s
Wall time: 1.09 s


In [35]:
%%time
#更高效地遍历行
for tup in df.itertuples(): continue

CPU times: user 25.3 ms, sys: 259 µs, total: 25.6 ms
Wall time: 25.1 ms


In [36]:
#查看行的数据类型
for tup in df.itertuples(): print(type(row));break

<class 'tuple'>
