# 基本数据对象及操作

## 1. Series

* 创建Series

In [1]:
import pandas as pd

# Build a Series from a plain list of strings (no explicit index is given).
countries = ['中国', '美国', '澳大利亚']
countries_s = pd.Series(data=countries)
# Show, one per line: the object's type, the Series itself, its index and
# its underlying values.
print(type(countries_s), countries_s, countries_s.index, countries_s.values, sep='\n')

In [2]:
# Same construction as for the strings, this time from a list of integers.
numbers = [4, 5, 6]
numbers_s = pd.Series(data=numbers)
# Show, one per line: the Series, its index and its underlying values.
print(numbers_s, numbers_s.index, numbers_s.values, sep='\n')

In [3]:
# Build a Series from a dict: the keys are used as the index labels.
country_dicts = {'CH': '中国', 'US': '美国', 'AU': '澳大利亚'}

# Name the data at construction time ...
country_dict_s = pd.Series(country_dicts, name='Country')
# ... and name the index afterwards.
country_dict_s.index.name = 'Code'

# Show, one per line: the Series, its values and its index.
print(country_dict_s, country_dict_s.values, country_dict_s.index, sep='\n')

* 处理缺失数据

In [4]:
# A list of strings whose last element is None: the missing value being demonstrated.
countries = ['中国', '美国', '澳大利亚', None]
print(pd.Series(countries))

In [5]:
# Same experiment with numbers: a numeric list whose last element is None.
# NOTE(review): pandas is expected to store this None as NaN in a float Series — run to confirm.
numbers = [4, 5, 6, None]
print(pd.Series(numbers))

* Series 索引

In [6]:
# Rebuild the country Series (code -> name) used by the indexing examples.
country_dicts = dict(CH='中国', US='美国', AU='澳大利亚')
country_dict_s = pd.Series(data=country_dicts)
print(country_dict_s)

CH      中国
US      美国
AU    澳大利亚
dtype: object


In [7]:
# Test whether a label exists in the index.
# A Series can also be viewed as a fixed-length, ordered dict, so `in` checks
# the index labels.
print('CH' in country_dict_s)
print('NZ' in country_dict_s)

True
False


In [8]:
# Three ways to fetch the same element:
# iloc selects by integer position; loc and [] select by index label here.
print('iloc:', country_dict_s.iloc[1])
print('loc:', country_dict_s.loc['US'])
print('[]:', country_dict_s['US'])

iloc: 美国
loc: 美国
[]: 美国


In [9]:
# Passing a list selects several elements at once and returns a Series:
# integer positions for iloc, index labels for loc.
print('iloc:\n', country_dict_s.iloc[ [0, 2] ])
print()
print('loc:\n', country_dict_s.loc[['US', 'AU']])

iloc:
 CH      中国
AU    澳大利亚
dtype: object

loc:
 US      美国
AU    澳大利亚
dtype: object


* 向量化操作

In [10]:
import numpy as np

# 10,000 random integers drawn from [0, 1000), used by the timing cells below.
s = pd.Series(np.random.randint(low=0, high=1000, size=10000))
# Preview the first rows, then the total length.
print(s.head(), len(s), sep='\n')

0    762
1    725
2    979
3    424
4    796
dtype: int32
10000


In [11]:
%%timeit -n 100
# Baseline for the timing comparison: sum the Series with an explicit Python loop.
total = 0
for item in s:
    total += item

488 µs ± 19.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [12]:
%%timeit -n 100
# Vectorized counterpart of the loop: compute the sum with a single NumPy call.
total = np.sum(s)

86.1 µs ± 4.93 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
%%timeit -n 10
# Slow baseline: add 2 to every element with one label-based write per element.
# Series.iteritems() was deprecated in pandas 1.5 and removed in pandas 2.0;
# Series.items() yields the same (label, value) pairs.
s = pd.Series(np.random.randint(0, 1000, 10000))
for label, value in s.items():
    s.loc[label] = value + 2

4.81 s ± 1.18 s per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
%%timeit -n 10
# Vectorized counterpart of the per-element loop: add 2 to every element in one operation.
s = pd.Series(np.random.randint(0, 1000, 10000))
s += 2

The slowest run took 6.74 times longer than the fastest. This could mean that an intermediate result is being cached.
353 µs ± 373 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## 2. DataFrame

* 创建DataFrame

In [38]:
import pandas as pd

# One Series per country; the dict keys become the DataFrame's column labels.
country1 = pd.Series(
    {'Name': '中国', 'Language': 'Chinese', 'Area': '9.597M km2', 'Happiness Rank': 79}
)
country2 = pd.Series(
    {'Name': '美国', 'Language': 'English (US)', 'Area': '9.834M km2', 'Happiness Rank': 14}
)
country3 = pd.Series(
    {'Name': '澳大利亚', 'Language': 'English (AU)', 'Area': '7.692M km2', 'Happiness Rank': 9}
)

# Stack the three Series as rows and label each row with a country code.
df = pd.DataFrame(data=[country1, country2, country3], index=['CH', 'US', 'AU'])

In [39]:
# Note the difference in Jupyter between print() and a bare last expression:
# print() emits plain text, while the bare expression is shown as the cell's rich output.
print(df)
df

    Name      Language        Area  Happiness Rank
CH    中国       Chinese  9.597M km2              79
US    美国  English (US)  9.834M km2              14
AU  澳大利亚  English (AU)  7.692M km2               9


Unnamed: 0,Name,Language,Area,Happiness Rank
CH,中国,Chinese,9.597M km2,79
US,美国,English (US),9.834M km2,14
AU,澳大利亚,English (AU),7.692M km2,9


In [40]:
# Add columns.
# A scalar is broadcast to every row.
# A list supplies one value per row; per the original note, supplying too many values raises an error.
df['Location'] = '地球'
print(df)

df['Region'] = ['亚洲', '北美洲', '大洋洲']
# print(df)
df

    Name      Language        Area  Happiness Rank Location
CH    中国       Chinese  9.597M km2              79       地球
US    美国  English (US)  9.834M km2              14       地球
AU  澳大利亚  English (AU)  7.692M km2               9       地球


Unnamed: 0,Name,Language,Area,Happiness Rank,Location,Region
CH,中国,Chinese,9.597M km2,79,地球,亚洲
US,美国,English (US),9.834M km2,14,地球,北美洲
AU,澳大利亚,English (AU),7.692M km2,9,地球,大洋洲


* DataFrame索引

In [41]:
# Row indexing: loc selects a row by label, iloc by integer position.
# A single row comes back as a Series whose labels are the column names.
print('loc:')
print(df.loc['CH'])
print(type(df.loc['CH']))

print('iloc:')
print(df.iloc[1])

loc:
Name                      中国
Language             Chinese
Area              9.597M km2
Happiness Rank            79
Location                  地球
Region                    亚洲
Name: CH, dtype: object
<class 'pandas.core.series.Series'>
iloc:
Name                        美国
Language          English (US)
Area                9.834M km2
Happiness Rank              14
Location                    地球
Region                     北美洲
Name: US, dtype: object
CH    9.597M km2
US    9.834M km2
AU    7.692M km2
Name: Area, dtype: object
<class 'pandas.core.series.Series'>


In [42]:
# Column indexing: [] with a column name returns that column as a Series.
print(df['Area'])
print(type(df['Area']))

CH    9.597M km2
US    9.834M km2
AU    7.692M km2
Name: Area, dtype: object
<class 'pandas.core.series.Series'>


In [20]:
# Select several (not necessarily adjacent) columns by passing a list of names.
print(df[['Name', 'Area']])

    Name        Area
CH    中国  9.597M km2
US    美国  9.834M km2
AU  澳大利亚  7.692M km2


In [21]:
# Mixed indexing: the same cell can be reached column-first or row-first.
# Note the difference in how each form is written.
print('先取出列，再取行：')
print(df['Area']['CH'])
print(df['Area'].loc['CH'])
print(df['Area'].iloc[0])

print('先取出行，再取列：')
print(df.loc['CH']['Area'])
print(df.iloc[0]['Area'])

先取出列，再取行：
9.597M km2
9.597M km2
9.597M km2
先取出行，再取列：
9.597M km2
9.597M km2


In [22]:
# Transpose: swap rows and columns.
print(df.T)

                        CH            US            AU
Name                    中国            美国          澳大利亚
Language           Chinese  English (US)  English (AU)
Area            9.597M km2    9.834M km2    7.692M km2
Happiness Rank          79            14             9
Location                地球            地球            地球
Region                  亚洲           北美洲           大洋洲


* 删除数据

In [23]:
# Drop the row labelled 'CH' and print the result.
print(df.drop(['CH']))
# Note: drop() only returns a modified copy; the original data is not changed.
print(df)

    Name      Language        Area  Happiness Rank Location Region
US    美国  English (US)  9.834M km2              14       地球    北美洲
AU  澳大利亚  English (AU)  7.692M km2               9       地球    大洋洲
    Name      Language        Area  Happiness Rank Location Region
CH    中国       Chinese  9.597M km2              79       地球     亚洲
US    美国  English (US)  9.834M km2              14       地球    北美洲
AU  澳大利亚  English (AU)  7.692M km2               9       地球    大洋洲


In [24]:
# NOTE(review): not idempotent — presumably a second run fails because 'CH' is already gone.
print(df.drop(['CH'], inplace=True))
# With inplace=True the original data is modified and no copy is returned
# (hence the None printed above).
print(df)

None
    Name      Language        Area  Happiness Rank Location Region
US    美国  English (US)  9.834M km2              14       地球    北美洲
AU  澳大利亚  English (AU)  7.692M km2               9       地球    大洋洲


In [25]:
# To drop a column instead of a row, pass axis=1.
# No inplace is used, so df itself still contains 'Area' afterwards.
print(df.drop(['Area'], axis=1))
print(df)

    Name      Language  Happiness Rank Location Region
US    美国  English (US)              14       地球    北美洲
AU  澳大利亚  English (AU)               9       地球    大洋洲
    Name      Language        Area  Happiness Rank Location Region
US    美国  English (US)  9.834M km2              14       地球    北美洲
AU  澳大利亚  English (AU)  7.692M km2               9       地球    大洋洲


In [26]:
# A column can also be removed with the del keyword; this changes df itself.
del df['Name']
print(df)

        Language        Area  Happiness Rank Location Region
US  English (US)  9.834M km2              14       地球    北美洲
AU  English (AU)  7.692M km2               9       地球    大洋洲


* DataFrame的操作与加载

In [27]:
# Display the 'Happiness Rank' column as it is before the modifications that follow.
df['Happiness Rank']

US    14
AU     9
Name: Happiness Rank, dtype: int64

In [28]:
# Note: modifying data taken out of a DataFrame can affect the original data.
# `ranks` is taken straight from df without copy(), and in the output recorded
# for this cell the in-place += also shows up in df.
# NOTE(review): this depends on the pandas version — with Copy-on-Write enabled
# (documented as the default from pandas 3.0) df should stay unchanged; confirm.
ranks = df['Happiness Rank']
ranks += 2
print(ranks)
print(df)

US    16
AU    11
Name: Happiness Rank, dtype: int64
        Language        Area  Happiness Rank Location Region
US  English (US)  9.834M km2              16       地球    北美洲
AU  English (AU)  7.692M km2              11       地球    大洋洲


In [29]:
# Note: modifying data taken out of a DataFrame can affect the original data.
# The safe approach is to work on a copy(): only `ranks` changes, df does not.
ranks = df['Happiness Rank'].copy()
ranks += 2
print(ranks)
print(df)

US    18
AU    13
Name: Happiness Rank, dtype: int64
        Language        Area  Happiness Rank Location Region
US  English (US)  9.834M km2              16       地球    北美洲
AU  English (AU)  7.692M km2              11       地球    大洋洲


In [30]:
# Load data from a CSV file (the path is relative to the working directory).
# NOTE(review): 'reprot' looks like a typo for 'report'; the name is used by
# later cells, so it is left as is here.
reprot_2015_df = pd.read_csv('./2015.csv')
print('2015年数据预览：')
#print(reprot_2015_df.head())
reprot_2015_df.head()

2015年数据预览：


Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [31]:
# Summarise column dtypes, non-null counts and memory usage.
# DataFrame.info() writes the report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
reprot_2015_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 12 columns):
Country                          158 non-null object
Region                           158 non-null object
Happiness Rank                   158 non-null int64
Happiness Score                  158 non-null float64
Standard Error                   158 non-null float64
Economy (GDP per Capita)         158 non-null float64
Family                           158 non-null float64
Health (Life Expectancy)         158 non-null float64
Freedom                          158 non-null float64
Trust (Government Corruption)    158 non-null float64
Generosity                       158 non-null float64
Dystopia Residual                158 non-null float64
dtypes: float64(9), int64(1), object(2)
memory usage: 14.9+ KB
None


## 3. 索引

In [32]:
# Use index_col to choose the column that becomes the index.
# Use usecols to restrict which columns are read.
reprot_2016_df = pd.read_csv('./2016.csv', 
                             index_col='Country',
                             usecols=['Country', 'Happiness Rank', 'Happiness Score', 'Region'])
# Preview the data.
reprot_2016_df.head()

Unnamed: 0_level_0,Region,Happiness Rank,Happiness Score
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Denmark,Western Europe,1,7.526
Switzerland,Western Europe,2,7.509
Iceland,Western Europe,3,7.501
Norway,Western Europe,4,7.498
Finland,Western Europe,5,7.413


In [33]:
# Show the column labels and the row labels (index) of the 2016 frame.
print('列名(column)：', reprot_2016_df.columns)
print('行名(index)：', reprot_2016_df.index)

列名(column)： Index(['Region', 'Happiness Rank', 'Happiness Score'], dtype='object')
行名(index)： Index(['Denmark', 'Switzerland', 'Iceland', 'Norway', 'Finland', 'Canada',
       'Netherlands', 'New Zealand', 'Australia', 'Sweden',
       ...
       'Madagascar', 'Tanzania', 'Liberia', 'Guinea', 'Rwanda', 'Benin',
       'Afghanistan', 'Togo', 'Syria', 'Burundi'],
      dtype='object', name='Country', length=157)


In [34]:
# Note: an Index is immutable, so assigning to one of its positions raises TypeError.
# Catch the error and print its message so the cell demonstrates the behaviour
# without aborting a Restart & Run All.
try:
    reprot_2016_df.index[0] = '丹麦'
except TypeError as err:
    print('TypeError:', err)

TypeError: Index does not support mutable operations

In [None]:
# Reset the index.
# Note the difference between using and omitting inplace: with inplace=True the
# frame itself is changed, so the result is not assigned.
# NOTE(review): expected to turn the 'Country' index back into a regular column — run to confirm.
reprot_2016_df.reset_index(inplace=True)

In [None]:
# Preview the frame after the index reset.
reprot_2016_df.head()

In [None]:
# Rename columns without inplace: rename() returns a new DataFrame, which is
# assigned back to the same name.
# The keys must match the existing column names exactly; rename() silently
# skips keys it cannot find, so the former 'Hapiness ...' spellings left two
# of the three columns unrenamed.
reprot_2016_df = reprot_2016_df.rename(columns={'Region': '地区', 'Happiness Rank': '排名', 'Happiness Score': '幸福指数'})
reprot_2016_df.head()

In [None]:
# Rename columns, this time with inplace=True: the frame is changed directly,
# so the result is not assigned.
reprot_2016_df.rename(columns={'Region': '地区', 'Happiness Rank': '排名', 'Happiness Score': '幸福指数'},
                     inplace=True)
reprot_2016_df.head()

## 4. Boolean Mask

In [None]:
# Preview the frame that the boolean masks are applied to.
reprot_2016_df.head()

In [None]:
# Keep only the countries in the Western Europe region:
# the comparison yields a boolean mask that is used to index the frame.
# only_western_europe = reprot_2016_df['地区'] == 'Western Europe'
reprot_2016_df[reprot_2016_df['地区'] == 'Western Europe']

In [None]:
# Build a mask for countries in Western Europe
# whose rank is greater than 10 (i.e. outside the top 10).
# Each comparison is parenthesised before the two masks are combined with &.
only_western_europe_10 = (reprot_2016_df['地区'] == 'Western Europe') & (reprot_2016_df['排名'] > 10)
only_western_europe_10

In [None]:
# Apply the combined boolean mask to get the final result.
reprot_2016_df[only_western_europe_10]

In [None]:
# Once familiar with masks, the whole filter can be written on a single line.
reprot_2016_df[(reprot_2016_df['地区'] == 'Western Europe') & (reprot_2016_df['排名'] > 10)]

## 5. 层级索引

In [None]:
# Preview the 2015 data used for the hierarchical-index examples.
reprot_2015_df.head()

In [None]:
# Build a hierarchical (two-level) index from 'Region' and 'Country';
# the result is stored under a new name and its first 20 rows are shown.
report_2015_df2 = reprot_2015_df.set_index(['Region', 'Country'])
report_2015_df2.head(20)

In [None]:
# Select by the level-0 label only (Region).
report_2015_df2.loc['Western Europe']

In [None]:
# Select with both index levels: Region first, then Country.
report_2015_df2.loc['Western Europe', 'Switzerland']

In [None]:
# Swap the order of the index levels (the result is displayed, not stored).
report_2015_df2.swaplevel()

In [None]:
# Sort by the level-0 index (the result is displayed, not stored).
report_2015_df2.sort_index(level=0)