# Pandas study from scratch

## 1. Load related library

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')  # ignore warning message

import os

# Author's local project directory. Later cells read files through relative
# paths such as "data/gapminder.tsv", which resolve against the working directory.
PROJECT_DIR = "C:\\dataAnalysis\\philosophy_datascience\\04.Pandas"

# Only switch directories when that location actually exists. On any other
# machine the notebook keeps running from the current working directory
# instead of aborting here with FileNotFoundError.
if os.path.isdir(PROJECT_DIR):
    os.chdir(PROJECT_DIR)
os.getcwd()  # last expression of the cell: show the working directory in use

## 2. data load

In [2]:
data = pd.read_csv("data/gapminder.tsv", sep = '\t')

## 3. data glancing.
 * 1704 rows , 6 columns

In [3]:
data.shape, type(data.shape)

((1704, 6), tuple)

In [4]:
number_of_rows = data.shape[0]
print(number_of_rows)

1704


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


In [6]:
data.head(n = 5)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [7]:
data.tail(n = 5)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.44996
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623
1703,Zimbabwe,Africa,2007,43.487,12311143,469.709298


In [8]:
data.describe()

Unnamed: 0,year,lifeExp,pop,gdpPercap
count,1704.0,1704.0,1704.0,1704.0
mean,1979.5,59.474439,29601210.0,7215.327081
std,17.26533,12.917107,106157900.0,9857.454543
min,1952.0,23.599,60011.0,241.165877
25%,1965.75,48.198,2793664.0,1202.060309
50%,1979.5,60.7125,7023596.0,3531.846989
75%,1993.25,70.8455,19585220.0,9325.462346
max,2007.0,82.603,1318683000.0,113523.1329


In [9]:
# Print the row labels and the column labels of the frame.
# A bare `data.index, data.columns` tuple placed before the prints is never
# displayed (only a cell's last expression is), so it is omitted.
print(data.index)
print(data.columns)

RangeIndex(start=0, stop=1704, step=1)
Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')


### 3.1 Pandas data types vs. Python types
* `object` vs. `str`
* `int64` vs. `int`
* `float64` vs. `float`
* `datetime64` vs. `datetime`

### 4. Data Extraction

In [10]:
country_df = data['country'] # country_Df = data.country
type(country_df)

pandas.core.series.Series

In [11]:
[api for api in dir(country_df) if not api.startswith("_")]

['T',
 'abs',
 'add',
 'add_prefix',
 'add_suffix',
 'agg',
 'aggregate',
 'align',
 'all',
 'any',
 'append',
 'apply',
 'argmax',
 'argmin',
 'argsort',
 'array',
 'asfreq',
 'asof',
 'astype',
 'at',
 'at_time',
 'attrs',
 'autocorr',
 'axes',
 'between',
 'between_time',
 'bfill',
 'bool',
 'clip',
 'combine',
 'combine_first',
 'convert_dtypes',
 'copy',
 'corr',
 'count',
 'cov',
 'cummax',
 'cummin',
 'cumprod',
 'cumsum',
 'describe',
 'diff',
 'div',
 'divide',
 'divmod',
 'dot',
 'drop',
 'drop_duplicates',
 'droplevel',
 'dropna',
 'dtype',
 'dtypes',
 'duplicated',
 'empty',
 'eq',
 'equals',
 'ewm',
 'expanding',
 'explode',
 'factorize',
 'ffill',
 'fillna',
 'filter',
 'first',
 'first_valid_index',
 'floordiv',
 'ge',
 'get',
 'groupby',
 'gt',
 'hasnans',
 'head',
 'hist',
 'iat',
 'idxmax',
 'idxmin',
 'iloc',
 'index',
 'infer_objects',
 'interpolate',
 'is_monotonic',
 'is_monotonic_decreasing',
 'is_monotonic_increasing',
 'is_unique',
 'isin',
 'isna',
 'isnull',


In [12]:
subset_df = data[['country', 'continent', 'year']]
print(type(subset_df))
subset_df.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,country,continent,year
0,Afghanistan,Asia,1952
1,Afghanistan,Asia,1957
2,Afghanistan,Asia,1962
3,Afghanistan,Asia,1967
4,Afghanistan,Asia,1972


### 4.1 Label-based row extraction with loc[]

In [13]:
help(subset_df.loc)

Help on _LocIndexer in module pandas.core.indexing object:

class _LocIndexer(_LocationIndexer)
 |  Access a group of rows and columns by label(s) or a boolean array.
 |  
 |  ``.loc[]`` is primarily label based, but may also be used with a
 |  boolean array.
 |  
 |  Allowed inputs are:
 |  
 |  - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is
 |    interpreted as a *label* of the index, and **never** as an
 |    integer position along the index).
 |  - A list or array of labels, e.g. ``['a', 'b', 'c']``.
 |  - A slice object with labels, e.g. ``'a':'f'``.
 |  
 |        start and the stop are included
 |  
 |  - A boolean array of the same length as the axis being sliced,
 |    e.g. ``[True, False, True]``.
 |  - A ``callable`` function with one argument (the calling Series or
 |    DataFrame) and that returns valid output for indexing (one of the above)
 |  
 |  See more at :ref:`Selection by Label <indexing.label>`
 |  
 |  Raises
 |  ------
 |  KeyError
 |      If any 




In [14]:
df = pd.DataFrame(data = [[1,2], [4, 5], [7,8]],
                  index = ['corba', 'viper', 'sidewinder'],
                  columns = ['max_speed', 'shield']
                 )
df

Unnamed: 0,max_speed,shield
corba,1,2
viper,4,5
sidewinder,7,8


In [15]:
# df.loc[0]  # TypeError: cannot do label indexing on <class 'pandas.core.indexes.base.Index'> with these indexers [0] of <class 'int'>
type(df.loc['viper']), df.loc['viper'], type(df.loc[['viper']])

(pandas.core.series.Series,
 max_speed    4
 shield       5
 Name: viper, dtype: int64,
 pandas.core.frame.DataFrame)

In [16]:
df.loc[['viper', 'sidewinder']]  # label-based indexing

Unnamed: 0,max_speed,shield
viper,4,5
sidewinder,7,8


In [17]:
df.loc['viper': 'sidewinder']

Unnamed: 0,max_speed,shield
viper,4,5
sidewinder,7,8


In [18]:
df.loc['viper': 'sidewinder', 'max_speed']

viper         4
sidewinder    7
Name: max_speed, dtype: int64

In [19]:
data.index

RangeIndex(start=0, stop=1704, step=1)

In [20]:
data.loc[:, ['country', 'continent', 'year', 'lifeExp']] # 모든 행, 지정된 컬럼

Unnamed: 0,country,continent,year,lifeExp
0,Afghanistan,Asia,1952,28.801
1,Afghanistan,Asia,1957,30.332
2,Afghanistan,Asia,1962,31.997
3,Afghanistan,Asia,1967,34.020
4,Afghanistan,Asia,1972,36.088
...,...,...,...,...
1699,Zimbabwe,Africa,1987,62.351
1700,Zimbabwe,Africa,1992,60.377
1701,Zimbabwe,Africa,1997,46.809
1702,Zimbabwe,Africa,2002,39.989


In [21]:
subset = data.loc[0:10, ['year', 'pop']]
subset

Unnamed: 0,year,pop
0,1952,8425333
1,1957,9240934
2,1962,10267083
3,1967,11537966
4,1972,13079460
5,1977,14880372
6,1982,12881816
7,1987,13867957
8,1992,16317921
9,1997,22227415


In [22]:
data.loc[[0, 99]]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
99,Bangladesh,Asia,1967,43.453,62821884,721.186086


In [23]:
small_range  = range(3)
subset_range = data.iloc[:, small_range]
subset_range.head()

Unnamed: 0,country,continent,year
0,Afghanistan,Asia,1952
1,Afghanistan,Asia,1957
2,Afghanistan,Asia,1962
3,Afghanistan,Asia,1967
4,Afghanistan,Asia,1972


In [24]:
subset_range = data.iloc[0:9,1:5]
subset_range.head(n = 10)

Unnamed: 0,continent,year,lifeExp,pop
0,Asia,1952,28.801,8425333
1,Asia,1957,30.332,9240934
2,Asia,1962,31.997,10267083
3,Asia,1967,34.02,11537966
4,Asia,1972,36.088,13079460
5,Asia,1977,38.438,14880372
6,Asia,1982,39.854,12881816
7,Asia,1987,40.822,13867957
8,Asia,1992,41.674,16317921


#### Note: with `loc`, the stop label of a slice is included in the result set.

In [25]:
data.loc[0:99 ,]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.853030
2,Afghanistan,Asia,1962,31.997,10267083,853.100710
3,Afghanistan,Asia,1967,34.020,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
...,...,...,...,...,...,...
95,Bahrain,Asia,2007,75.635,708573,29796.048340
96,Bangladesh,Asia,1952,37.484,46886859,684.244172
97,Bangladesh,Asia,1957,39.348,51365468,661.637458
98,Bangladesh,Asia,1962,41.216,56839289,686.341554


#### check this out to fetch the last one

In [26]:
# Unpack the (rows, columns) tuple returned by DataFrame.shape.
nrows, ncols = data.shape
print("%d * %d "% (nrows, ncols))

# .loc is label-based: nrows - 1 is looked up as an index *label*. It matches
# the last row here only because the frame carries the default RangeIndex
# (0 .. nrows - 1), as the `data.index` output earlier in the notebook shows.
the_last = data.loc[nrows - 1]
print(the_last)
# Compare with the last rows returned by tail().
print(data.tail(n = 3))

1704 * 6 
country      Zimbabwe
continent      Africa
year             2007
lifeExp        43.487
pop          12311143
gdpPercap     469.709
Name: 1703, dtype: object
       country continent  year  lifeExp       pop   gdpPercap
1701  Zimbabwe    Africa  1997   46.809  11404948  792.449960
1702  Zimbabwe    Africa  2002   39.989  11926563  672.038623
1703  Zimbabwe    Africa  2007   43.487  12311143  469.709298


In [27]:
print(data.loc[[0, 9, 99]])

        country continent  year  lifeExp       pop   gdpPercap
0   Afghanistan      Asia  1952   28.801   8425333  779.445314
9   Afghanistan      Asia  1997   41.763  22227415  635.341351
99   Bangladesh      Asia  1967   43.453  62821884  721.186086


#### `loc` with a single label returns a Series, whereas `tail` returns a DataFrame.

In [28]:
subset_loc = data.loc[99]
subset_tail = data.tail(n = 2)

type(subset_loc), type(subset_tail)

(pandas.core.series.Series, pandas.core.frame.DataFrame)

### 4.2 iloc 
* iloc : integer position-based location
* the end point of a slice is excluded from the result set.

In [29]:
data.head(n = 10)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
5,Afghanistan,Asia,1977,38.438,14880372,786.11336
6,Afghanistan,Asia,1982,39.854,12881816,978.011439
7,Afghanistan,Asia,1987,40.822,13867957,852.395945
8,Afghanistan,Asia,1992,41.674,16317921,649.341395
9,Afghanistan,Asia,1997,41.763,22227415,635.341351


In [30]:
data.iloc[2:4]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138


In [31]:
df

Unnamed: 0,max_speed,shield
corba,1,2
viper,4,5
sidewinder,7,8


In [32]:
# .iloc is strictly position-based, so a label such as 'corba' is rejected with
# a TypeError. The error is the point of this cell; catch and print it so the
# demonstration does not abort "Restart & Run All".
try:
    df.iloc['corba']
except TypeError as err:
    print("TypeError:", err)

TypeError: Cannot index by location index with a non-integer key

In [None]:
df.iloc[0], df.iloc[0:2], df.iloc[[0,2]]

In [None]:
data.iloc[[-1, -2, -3]], type(data.iloc[[-1, -2, -3]])

In [None]:
data.iloc[:, 0:3]  # fetch colums from 0 index to 2 index

### 5. aggregation 

#### 5.1 group by single column or multiple columns.

In [None]:
data.head()

In [None]:
# Yearly mean of the numeric measures. 'continent' is a string column, so it is
# left out of the selection: averaging it raises TypeError on pandas >= 2.0
# (older versions silently dropped it from the result).
data.groupby('year')[['lifeExp', 'gdpPercap']].mean().sort_values(by = 'year', ascending = True)

In [None]:
data.groupby(['year', 'continent'])[['lifeExp', 'gdpPercap']].mean().sort_values(by = 'year', ascending = False)

In [None]:
# groupby() alone performs no aggregation; it returns a DataFrameGroupBy object.
grouped_year_df = data.groupby('year')
print(grouped_year_df)  # prints only the object's repr, not the grouped rows
# head() on a groupby selection returns the first 5 rows of every group.
grouped_year_df[['continent', 'year', 'lifeExp', 'gdpPercap',]].head()

#### 5.2 nunique
* nunique - 중복이 없는 총 갯수

In [None]:
data.groupby('continent')['country'].nunique()

### 6. simple visualization

In [None]:
global_yearly_life_exp = data.groupby('year')['lifeExp'].mean()
global_yearly_life_exp.head()

In [None]:
%matplotlib inline 
import matplotlib.pyplot as plt

##### 년도별 평균수명에 대한 그래프 

In [None]:
# Line chart of the yearly mean life expectancy held in global_yearly_life_exp.
# Title and axis labels are set so the figure can be read on its own; the
# trailing semicolon suppresses the repr of the returned artists.
ax = global_yearly_life_exp.plot()
ax.set(title="Average life expectancy by year", xlabel="year", ylabel="mean lifeExp");

## 7.시리즈와 데이터프레임 직접 만들기
 * 데이터프레임과 시리즈는 리스트나 딕셔너리와 비슷하지만 데이터를 다루는데 더 특화되어있다.

#### 7.1 make Series

In [None]:
import pandas as pd

s = pd.Series(['apple', 33])
print(s)

In [None]:
s = pd.Series(['Jane', 'Student'])
print(s)

#### 7.2 set index using a sequence of strings

In [None]:
s = pd.Series(data = ['Jane', 'student'], index = ['Person', 'Job'])
print(s, type(s), s.index)

In [None]:
df = pd.DataFrame(s)
df.head()

#### 7.3 DataFrame creation using Dictionary
* 데이터프레임의 각 컬럼은 Series이다. 파이썬 3.7 미만에서는 딕셔너리의 키 순서가 언어 차원에서 보장되지 않으므로 컬럼 순서가 입력 순서와 달라질 수 있다 (파이썬 3.7 이상에서는 삽입 순서가 유지됨).

In [None]:
scientists = pd.DataFrame(
    {
        'Name': ['Rosaline', 'William Gosset'],
        'Occupation': ['Chemist', 'Statistician'],
        'Born': ['1920-07-25', '1876-06-13'],
        'Died': ['1958-04-16', '1937-10-01'],
        'Age' : [37, 61]
    }
)
scientists

In [None]:
# Build the frame from a dict of equal-length lists.
# * `index` supplies the row labels. With list values nothing is aligned, the
#   labels are attached as given, so they must be spelled exactly like the
#   names they stand for ('Rosaline', 'William Gosset').
# * `columns` selects and orders the dict keys to keep; 'Name' is left out
#   on purpose because the names now live in the index.
scientists = pd.DataFrame(
    {
        'Name': ['Rosaline', 'William Gosset'],
        'Occupation': ['Chemist', 'Statistician'],
        'Born': ['1920-07-25', '1876-06-13'],
        'Died': ['1958-04-16', '1937-10-01'],
        'Age' : [37, 61]
    },
    index = ['Rosaline', 'William Gosset'],
    columns = ['Occupation', 'Born', 'Died', 'Age']
)
scientists

* OrderedDict()클래스를 이용한 데이터프레임 정렬

In [None]:
from collections import OrderedDict

# Same two-row frame as before, this time fed from an OrderedDict built out of
# (column name, values) pairs. `columns` fixes the column order explicitly and
# `index` labels the rows with the scientists' names.
scientists = pd.DataFrame(
    data = OrderedDict([
        ('Name', ['Rosaline', 'William Gosset']),
        ('Occupation', ['Chemist', 'Statistician']),
        ('Born', ['1920-07-25', '1876-06-13']),
        ('Died', ['1958-04-16', '1937-10-01']),
        ('Age', [37, 61]),
    ]),
    columns = ['Name', 'Occupation', 'Born', 'Died', 'Age'],
    index = ['Rosaline', 'William Gosset'],
)
scientists

In [None]:
# Label-based lookup of the row whose index label is "William Gosset"; a single
# label makes .loc return that row as a Series.
# NOTE(review): despite the name `first_row`, this label is the second entry of
# the index passed when `scientists` was built — confirm the intended row.
first_row = scientists.loc["William Gosset"]
print(first_row, type(first_row))

#### 7.4 Index, values, keys() API

In [None]:
print(first_row.index, type(first_row.index))

In [None]:
print(first_row.keys())

In [None]:
print(first_row.values)

In [None]:
print(type(first_row.keys()), first_row.keys()[0])  

#### 7.5 시리즈와 불린 추출하기 

In [38]:
scientist = pd.read_csv("data/scientists.csv")
scientist.head()

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist


In [46]:
ages = scientist['Age']
print(ages, type(ages))
print('최고령 : ', ages.max())
print('최연소 나이: ', ages.min())

print(scientist[scientist['Age']== ages.max()])
print(scientist[scientist['Age']== ages.min()])

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64 <class 'pandas.core.series.Series'>
최고령 :  90
최연소 나이:  37
                   Name        Born        Died  Age Occupation
2  Florence Nightingale  1820-05-12  1910-08-13   90      Nurse
                Name        Born        Died  Age Occupation
0  Rosaline Franklin  1920-07-25  1958-04-16   37    Chemist


In [51]:
# Mean of the Age column rounded to zero decimals (the result stays a float).
# NOTE(review): when this is used as a comparison threshold, the rounding can
# shift which rows count as "above average" — confirm that this is intended.
age_mean = np.round(ages.mean(), decimals = 0)

#### 평균 나이보다 오래 산 과학자는?

In [53]:
scientist[scientist['Age'] > age_mean]

Unnamed: 0,Name,Born,Died,Age,Occupation
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


## 8.broadcasting
* 두 시리즈를 더하면 shape이 아니라 인덱스 라벨을 기준으로 짝을 맞추어(alignment) 계산된다. 양쪽에 모두 있는 라벨은 정상 계산되고, 한쪽에만 있는 라벨은 NaN으로 처리된다.

In [58]:
# Adding two Series aligns them on index labels: only labels 0 and 1 exist in
# both operands, so every other label of `ages` yields NaN.
x = pd.Series([1, 100])
print(ages.shape, x.shape)
print(ages + x)  # reuse `x` so the printed shapes describe the actual operands

(8,) (2,)
0     38.0
1    161.0
2      NaN
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
dtype: float64


##### 주요 파라미터 설명
* sort_index(axis=0, ascending=True, inplace=False, ignore_index: bool = False)
* axis = 0 (행 기준), 1 (열 기준)
* ascending = True(기본값)
* inplace = False(기본값) - 원본 데이터프레임에 반영여부 
* ignore_index = False(기본값) - 기존 인덱스를 무시여부

In [65]:
# sort_index orders a Series by its index labels, not by its values.
# (The bare `ages.index` expression was dropped: it was never displayed.)
rev_ages = ages.sort_index(ascending = False)
print('내림차순 정렬 : ', rev_ages)
# The default is ascending=True, which leaves the 0..7 label order unchanged here.
asc_ages = ages.sort_index()
print('올림차순 정렬 : ', asc_ages)

내림차순 정렬 :  7    77
6    41
5    45
4    56
3    66
2    90
1    61
0    37
Name: Age, dtype: int64
올림차순 정렬 :  0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64


In [67]:
asc_ages + rev_ages

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

#### 정리
* 인덱스의 정렬 순서가 서로 다르더라도, 같은 위치가 아니라 같은 인덱스 라벨을 가진 값끼리 더해진다는 것(인덱스 정렬, alignment)을 위 결과를 통해 알 수 있음 (예: 라벨 0은 37 + 37 = 74)