In [34]:
# Import necessary librairies
import pandas as pd
import numpy as np

## Case 1: Pandas basic

In [4]:
# create a pandas series from dictionary
d = {'a': 1, 'b':2, 'c':3}
df = pd.Series(d)
df

a    1
b    2
c    3
dtype: int64

In [5]:
type(df)

pandas.core.series.Series

In [6]:
dates = pd.date_range('today', periods=6) # next 6days include today
num_arr = np.random.randn(6,4)
columns = ['A', 'B', 'C', 'D']
df = pd.DataFrame(num_arr
                 , index = dates
                 , columns = columns)
df

Unnamed: 0,A,B,C,D
2019-12-03 14:58:36.420368,1.23318,-0.628688,-0.712411,-1.006909
2019-12-04 14:58:36.420368,0.293054,1.808738,-0.851832,0.222028
2019-12-05 14:58:36.420368,0.731387,-0.405047,-0.109646,0.310462
2019-12-06 14:58:36.420368,-0.159579,0.204044,-0.276567,0.568432
2019-12-07 14:58:36.420368,-0.314534,1.143392,-0.948041,0.485254
2019-12-08 14:58:36.420368,-0.730229,1.901608,-1.418007,-0.279357


In [7]:
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
        'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
        'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
        'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}

labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df = pd.DataFrame(data, index=labels)
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [8]:
print(df.info())
df.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 4 columns):
animal      10 non-null object
age         8 non-null float64
visits      10 non-null int64
priority    10 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 400.0+ bytes
None


Unnamed: 0,age,visits
count,8.0,10.0
mean,3.4375,1.9
std,2.007797,0.875595
min,0.5,1.0
25%,2.375,1.0
50%,3.0,2.0
75%,4.625,2.75
max,7.0,3.0


In [10]:
# search by index [row, column]
df.iloc[:3]

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no


In [11]:
# search by column names 
print(df.loc[:, ['animal', 'age']], '\n'*2)
print(df.loc[df.index[[3, 4, 8]], ['animal', 'age']], '\n'*2)
print(df[df['age'] > 3], '\n'*2)

  animal  age
a    cat  2.5
b    cat  3.0
c  snake  0.5
d    dog  NaN
e    dog  5.0
f    cat  2.0
g  snake  4.5
h    cat  NaN
i    dog  7.0
j    dog  3.0 


  animal  age
d    dog  NaN
e    dog  5.0
i    dog  7.0 


  animal  age  visits priority
e    dog  5.0       2       no
g  snake  4.5       1       no
i    dog  7.0       2       no 




In [12]:
df[(df['age']>2) & (df['age']>4)]
# 方法二
# df[df['age'].between(2, 4)]

Unnamed: 0,animal,age,visits,priority
e,dog,5.0,2,no
g,snake,4.5,1,no
i,dog,7.0,2,no


In [13]:
print(df['age'].mean(), '\n')
print(df.groupby('animal')['age'].mean(), '\n')
# sort the data by age and visits, 
df.sort_values(by=['age', 'visits'], ascending=[False, True])

3.4375 

animal
cat      2.5
dog      5.0
snake    2.5
Name: age, dtype: float64 



Unnamed: 0,animal,age,visits,priority
i,dog,7.0,2,no
e,dog,5.0,2,no
g,snake,4.5,1,no
j,dog,3.0,1,no
b,cat,3.0,3,yes
a,cat,2.5,1,yes
f,cat,2.0,3,no
c,snake,0.5,2,no
h,cat,,1,yes
d,dog,,3,yes


In [19]:
df.loc['k'] = [5.5,'dog', 'no', 2]
df
df.drop('k')
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5,2,no
f,cat,2,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7,2,no
j,dog,3,1,no


In [21]:
df['animal'].value_counts()

dog      4
cat      4
snake    2
5.5      1
Name: animal, dtype: int64

In [23]:
# sort data by columns
# Ascending order: true
# Descending order: false
df.sort_values(by=['age', 'visits'], ascending=[False, True])

Unnamed: 0,animal,age,visits,priority
k,5.5,dog,no,2
i,dog,7,2,no
e,dog,5,2,no
g,snake,4.5,1,no
j,dog,3,1,no
b,cat,3,3,yes
a,cat,2.5,1,yes
f,cat,2,3,no
c,snake,0.5,2,no
h,cat,,1,yes


In [30]:
# map function to replace the data in dataframe
print('The format of DataFrame is {}'.format(type(df)))
print('The format of fraction of DataFrame is {}'.format(type(df['priority'])))
# only the dataframe.series has map function
df['priority'] = df['priority'].map({'yes': True, 'no': False})
df

The format of DataFrame is <class 'pandas.core.frame.DataFrame'>
The format of fraction of DataFrame is <class 'pandas.core.series.Series'>


Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,
b,cat,3,3,
c,snake,0.5,2,
d,dog,,3,
e,dog,5,2,
f,cat,2,3,
g,snake,4.5,1,
h,cat,,1,
i,dog,7,2,
j,dog,3,1,


In [32]:
# replace function to replace the data in dataframe
df['animal'] = df['animal'].replace('snake', 'python')
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,
b,cat,3,3,
c,python,0.5,2,
d,dog,,3,
e,dog,5,2,
f,cat,2,3,
g,python,4.5,1,
h,cat,,1,
i,dog,7,2,
j,dog,3,1,


## Case2: Pandas advance

In [37]:
# Delete the rows have duplicated values
df = pd.DataFrame({'A': [1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 7]})
print(df)
df1 = df.loc[df['A'].shift() != df['A']]
# 方法二
# df1 = df.drop_duplicates(subset='A')
print(df1)

    A
0   1
1   2
2   2
3   3
4   4
5   5
6   5
7   5
8   6
9   7
10  7
   A
0  1
1  2
3  3
4  4
5  5
8  6
9  7


In [40]:
# the value per cell minus the mean of the entire row
df = pd.DataFrame(np.random.random(size=(5, 3)))
print(df)
df1 = df.sub(df.mean(axis=1), axis=0)
print(df1)

          0         1         2
0  0.401381  0.316625  0.808087
1  0.531055  0.270467  0.215309
2  0.804145  0.649847  0.367860
3  0.037463  0.965938  0.212556
4  0.104950  0.718546  0.408406
          0         1         2
0 -0.107316 -0.192073  0.299389
1  0.192111 -0.068477 -0.123635
2  0.196861  0.042563 -0.239424
3 -0.367856  0.560619 -0.192763
4 -0.305684  0.307912 -0.002228


In [44]:
df = pd.DataFrame(np.random.random(size=(5, 5)), columns=list('abcde'))
print(df)
print(df.sum()) # sum by column
df.sum().idxmin() 

          a         b         c         d         e
0  0.589855  0.471419  0.825621  0.363500  0.574413
1  0.195670  0.598629  0.777964  0.056283  0.643563
2  0.500899  0.303324  0.434528  0.681805  0.342195
3  0.743060  0.927302  0.767141  0.984717  0.243623
4  0.144595  0.558797  0.434577  0.407042  0.424006
a    2.174078
b    2.859471
c    3.239832
d    2.493347
e    2.227800
dtype: float64


'a'