![panda](figures/panda.png)

 > **Pandas** is an open source Python library for data analysis.
- It gives Python the ability to work with numerical tables and time series for fast data loading, manipulating, aligning, merging, etc.
- The name is derived from 'panel data', an econometrics term for multidimensional structured datasets.

In [1]:
import pandas as pd
import numpy as np

# Series and DataFrame

Pandas introduces two new data types to Python: **Series** and **DataFrame**

## Series

> A Series is a one-dimensional array-like object containing a sequence of values and an associated array of data labels, called its **index**

In [3]:
s = pd.Series([4, 7, -5, 3])
s

0    4
1    7
2   -5
3    3
dtype: int64

In [6]:
# Mixing numbers with a string gives the Series the generic `object` dtype,
# so `*` acts element by element (for the string it repeats the text).
s_test = pd.Series(data=[4, 7, -5, 3, 'string'])
s_test * 2

0               8
1              14
2             -10
3               6
4    stringstring
dtype: object

- The string representation of a Series displayed interactively shows the index on the
left and the values on the right.
- Since we did not specify an index for the data, a
default one consisting of the integers 0 through n-1 (where n is the length of the data) is created.

In [7]:
# Supply explicit labels instead of the default 0..n-1 integer index.
s = pd.Series(data=[4, 7, -5, 3], index=list('abcd'))
s

a    4
b    7
c   -5
d    3
dtype: int64

In [8]:
s.values

array([ 4,  7, -5,  3])

In [9]:
s.index

Index(['a', 'b', 'c', 'd'], dtype='object')

### Selecting a single value or a set of values using the index

In [10]:
s['b']

7

In [12]:
s[['b']]

b    7
dtype: int64

In [11]:
s[['c', 'a', 'b']]

c   -5
a    4
b    7
dtype: int64

In [10]:
s

a    4
b    7
c   -5
d    3
dtype: int64

In [11]:
# Select the third element by position. Plain `s[2]` on a label-indexed Series is
# deprecated in recent pandas (integer keys will be treated as labels), so use .iloc.
s.iloc[2]

  s[2]


-5

In [12]:
s.iloc[2]

-5

In [13]:
s[1:3]

b    7
c   -5
dtype: int64

In [14]:
s.iloc[[1,3]]

b    7
d    3
dtype: int64

### Filtering

In [15]:
s

a    4
b    7
c   -5
d    3
dtype: int64

In [16]:
s > 0

a     True
b     True
c    False
d     True
dtype: bool

In [17]:
s[s > 0]

a    4
b    7
d    3
dtype: int64

### Math operations

In [18]:
s**2

a    16
b    49
c    25
d     9
dtype: int64

In [19]:
np.exp(s)

a      54.598150
b    1096.633158
c       0.006738
d      20.085537
dtype: float64

In [20]:
s.mean()

2.25

In [21]:
s.var()

26.25

pandas automatically aligns Series by index label in arithmetic operations.

In [22]:
s

a    4
b    7
c   -5
d    3
dtype: int64

In [23]:
# A second Series whose labels only partly overlap with those of `s`.
s2 = pd.Series(data=[1, 2, 3, 4], index=list('acde'))
s2

a    1
c    2
d    3
e    4
dtype: int64

In [24]:
s + s2

a    5.0
b    NaN
c   -3.0
d    6.0
e    NaN
dtype: float64

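**Note**: `NaN` ("Not a Number") is how pandas marks missing values. Here it appears for the labels `b` and `e`, which exist in only one of the two Series.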

In [25]:
s.index = ['a', 'c', 'd', 'e']
s

a    4
c    7
d   -5
e    3
dtype: int64

In [26]:
s + s2

a    5
c    9
d   -2
e    7
dtype: int64

## More methods for Series

In [27]:
 dir(s)

['T',
 '_AXIS_LEN',
 '_AXIS_ORDERS',
 '_AXIS_TO_AXIS_NUMBER',
 '_HANDLED_TYPES',
 '__abs__',
 '__add__',
 '__and__',
 '__annotations__',
 '__array__',
 '__array_priority__',
 '__array_ufunc__',
 '__bool__',
 '__class__',
 '__column_consortium_standard__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pandas_priority__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__

In [28]:
[attr for attr in dir(s) if not attr.startswith('_')]

['T',
 'a',
 'abs',
 'add',
 'add_prefix',
 'add_suffix',
 'agg',
 'aggregate',
 'align',
 'all',
 'any',
 'apply',
 'argmax',
 'argmin',
 'argsort',
 'array',
 'asfreq',
 'asof',
 'astype',
 'at',
 'at_time',
 'attrs',
 'autocorr',
 'axes',
 'backfill',
 'between',
 'between_time',
 'bfill',
 'bool',
 'c',
 'case_when',
 'clip',
 'combine',
 'combine_first',
 'compare',
 'convert_dtypes',
 'copy',
 'corr',
 'count',
 'cov',
 'cummax',
 'cummin',
 'cumprod',
 'cumsum',
 'd',
 'describe',
 'diff',
 'div',
 'divide',
 'divmod',
 'dot',
 'drop',
 'drop_duplicates',
 'droplevel',
 'dropna',
 'dtype',
 'dtypes',
 'duplicated',
 'e',
 'empty',
 'eq',
 'equals',
 'ewm',
 'expanding',
 'explode',
 'factorize',
 'ffill',
 'fillna',
 'filter',
 'first',
 'first_valid_index',
 'flags',
 'floordiv',
 'ge',
 'get',
 'groupby',
 'gt',
 'hasnans',
 'head',
 'hist',
 'iat',
 'idxmax',
 'idxmin',
 'iloc',
 'index',
 'infer_objects',
 'info',
 'interpolate',
 'is_monotonic_decreasing',
 'is_monotonic_in

In [29]:
help(s.all)

Help on method all in module pandas.core.series:

all(axis: 'Axis' = 0, bool_only: 'bool' = False, skipna: 'bool' = True, **kwargs) -> 'bool' method of pandas.core.series.Series instance
    Return whether all elements are True, potentially over an axis.

    Returns True unless there at least one element within a series or
    along a Dataframe axis that is False or equivalent (e.g. zero or
    empty).

    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns', None}, default 0
        Indicate which axis or axes should be reduced. For `Series` this parameter
        is unused and defaults to 0.

        * 0 / 'index' : reduce the index, return a Series whose index is the
          original column labels.
        * 1 / 'columns' : reduce the columns, return a Series whose index is the
          original index.
        * None : reduce all axes, return a scalar.

    bool_only : bool, default False
        Include only boolean columns. Not implemented for Series.
    skipna

In [30]:
# Use the vectorized Series.all() method rather than Python's built-in all(),
# which would iterate over the boolean Series element by element.
(s > 0).all()

False

## DataFrame

> A DataFrame represents a rectangular table of data and contains an ordered collection
of columns.

* The DataFrame has both a row and column index.
* Since each column of a DataFrame is essentially a Series labeled by its column name, the DataFrame can be thought of as a dictionary of Series all sharing the same row index.
<!-- * Each column (Series) has to be the same type, whereas, each row can contain mixed types. -->

### Creating DataFrame

#### from a dict of equal-length lists

In [31]:
# Each key becomes a column name; all lists have the same length (one entry per row).
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
data

{'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
 'year': [2000, 2001, 2002, 2001, 2002, 2003],
 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

In [32]:
d = pd.DataFrame(data)
d

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


#### Start with an empty DataFrame

In [33]:
d1 = pd.DataFrame()
d1

In [34]:
d1['state'] = ['Ohio', 'Nevada']
d1

Unnamed: 0,state
0,Ohio
1,Nevada


In [38]:
d1['year'] = [2001, 2001]
d1['pop'] = [1.7, 2.4]
d1

Unnamed: 0,state,year,pop
0,Ohio,2001,1.7
1,Nevada,2001,2.4


### select columns

In [39]:
d

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [40]:
d['state'] # return a Series

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [41]:
d[['state']] # a list of column names returns a DataFrame, even for a single column

Unnamed: 0,state
0,Ohio
1,Ohio
2,Ohio
3,Nevada
4,Nevada
5,Nevada


In [42]:
type(d['state'])

pandas.core.series.Series

In [43]:
type(d[['state']])

pandas.core.frame.DataFrame

In [44]:
d[['state','pop']]

Unnamed: 0,state,pop
0,Ohio,1.5
1,Ohio,1.7
2,Ohio,3.6
3,Nevada,2.4
4,Nevada,2.9
5,Nevada,3.2


### select rows

In [45]:
# The integers 0..15 laid out row by row in a 4x4 array.
rows = np.arange(16).reshape(4, 4)
rows

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [46]:
d2 = pd.DataFrame(rows,
                  index=['Ohio', 'Colorado', 'Utah', 'New York'],
                  columns=['one', 'two', 'three', 'four'])
d2

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [47]:
d2.loc['Ohio':"Utah"]  # label-based slice: the end label 'Utah' is included

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11


In [48]:
d2.iloc[1:3]  # position-based slice: rows 1 and 2, the end position 3 is excluded

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11


### change row index and column name

In [49]:
d2

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [50]:
d2.rename(index={'Colorado':'Connecticut'},columns={'one':'five'})

Unnamed: 0,five,two,three,four
Ohio,0,1,2,3
Connecticut,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [51]:
d2 # notice d2 is still the same

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [None]:
d3 = d2.rename(index={'Colorado':'Connecticut'},columns={'one':'five'}) # assign to a new variable
d3

In [52]:
# Passing inplace=True changes d2 itself instead of returning a renamed copy.
d2.rename(index={'Colorado':'Connecticut'},columns={'one':'five'}, inplace=True)

In [53]:
d2

Unnamed: 0,five,two,three,four
Ohio,0,1,2,3
Connecticut,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


### Basic attributes and methods

In [54]:
d2.index

Index(['Ohio', 'Connecticut', 'Utah', 'New York'], dtype='object')

In [55]:
d2.columns

Index(['five', 'two', 'three', 'four'], dtype='object')

In [56]:
d2.values

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [57]:
d2.shape

(4, 4)

In [58]:
d2.mean() # column-wise mean, More on aggregation later.

five     6.0
two      7.0
three    8.0
four     9.0
dtype: float64

### Alignment by index

In [59]:
# One column 'A' with the row labels in ascending order.
df3 = pd.DataFrame(data={'A': [1, 2, 3]}, index=[1, 2, 3])
df3

Unnamed: 0,A
1,1
2,2
3,3


In [60]:
# Same values in column 'A', but the row labels come in a different order.
df4 = pd.DataFrame(data={'A': [1, 2, 3]}, index=[3, 1, 2])
df4

Unnamed: 0,A
3,1
1,2
2,3


In [61]:

# Rows are paired by index label, not by position, before subtracting.
df3 - df4

Unnamed: 0,A
1,-1
2,-1
3,2


### add and delete rows and columns

In [62]:
d2

Unnamed: 0,five,two,three,four
Ohio,0,1,2,3
Connecticut,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [63]:
# drop() returns a new DataFrame without the given row and column;
# d2 itself only changes if inplace=True is passed.
d2.drop(index="Connecticut", columns="five")

Unnamed: 0,two,three,four
Ohio,1,2,3
Utah,9,10,11
New York,13,14,15


In [64]:
d2

Unnamed: 0,five,two,three,four
Ohio,0,1,2,3
Connecticut,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [65]:
del d2['five'] # this will change d2 directly
d2

Unnamed: 0,two,three,four
Ohio,1,2,3
Connecticut,5,6,7
Utah,9,10,11
New York,13,14,15


In [66]:
d2['one'] = [1, 2, 3, 4] # add new column
d2

Unnamed: 0,two,three,four,one
Ohio,1,2,3,1
Connecticut,5,6,7,2
Utah,9,10,11,3
New York,13,14,15,4


In [67]:
d2.pop('one') # directly change the original DataFrame

Ohio           1
Connecticut    2
Utah           3
New York       4
Name: one, dtype: int64

In [68]:
d2

Unnamed: 0,two,three,four
Ohio,1,2,3
Connecticut,5,6,7
Utah,9,10,11
New York,13,14,15


### Common methods

You can also import a dataset from a file.

#### csv file

In [75]:
df = pd.read_csv("./data/table.csv")

In [76]:
df

Unnamed: 0,School,Class,ID,Gender,Address,Height,Weight,Math,Physics
0,S_1,C_1,1101,M,street_1,173,63,34.0,A+
1,S_1,C_1,1102,F,street_2,192,73,32.5,B+
2,S_1,C_1,1103,M,street_2,186,82,87.2,B+
3,S_1,C_1,1104,F,street_2,167,81,80.4,B-
4,S_1,C_1,1105,F,street_4,159,64,84.8,B+
5,S_1,C_2,1201,M,street_5,188,68,97.0,A-
6,S_1,C_2,1202,F,street_4,176,94,63.5,B-
7,S_1,C_2,1203,M,street_6,160,53,58.8,A+
8,S_1,C_2,1204,F,street_5,162,63,33.8,B
9,S_1,C_2,1205,F,street_6,167,63,68.4,B-


#### txt file

In [77]:
df_txt = pd.read_table("data/table.txt")
df_txt

Unnamed: 0,col1,col2,col3,col4
0,2,a,1.4,apple
1,3,b,3.4,banana
2,6,c,2.5,orange
3,5,d,3.2,lemon


In [78]:
help(pd.read_table)

Help on function read_table in module pandas.io.parsers.readers:

read_table(filepath_or_buffer: 'FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]', *, sep: 'str | None | lib.NoDefault' = <no_default>, delimiter: 'str | None | lib.NoDefault' = None, header: "int | Sequence[int] | None | Literal['infer']" = 'infer', names: 'Sequence[Hashable] | None | lib.NoDefault' = <no_default>, index_col: 'IndexLabel | Literal[False] | None' = None, usecols: 'UsecolsArgType' = None, dtype: 'DtypeArg | None' = None, engine: 'CSVEngine | None' = None, converters: 'Mapping[Hashable, Callable] | None' = None, true_values: 'list | None' = None, false_values: 'list | None' = None, skipinitialspace: 'bool' = False, skiprows: 'list[int] | int | Callable[[Hashable], bool] | None' = None, skipfooter: 'int' = 0, nrows: 'int | None' = None, na_values: 'Sequence[str] | Mapping[str, Sequence[str]] | None' = None, keep_default_na: 'bool' = True, na_filter: 'bool' = True, verbose: 'bool | lib.NoDefault' = <no_d

#### xlsx file

In [79]:
conda install openpyxl

Retrieving notices: ...working... done
Channels:
 - defaults
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/ossifragus/miniconda3/envs/stat2255

  added / updated specs:
    - openpyxl


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2024.9.24  |       h06a4308_0         130 KB
    certifi-2024.8.30          |  py312h06a4308_0         163 KB
    et_xmlfile-1.1.0           |  py312h06a4308_1          12 KB
    openpyxl-3.1.5             |  py312h5eee18b_0         719 KB
    ------------------------------------------------------------
                                           Total:        1024 KB

The following NEW packages will be INSTALLED:

  et_xmlfile         pkgs/main/linux-64::et_xmlfile-1.1.0-py312h06a4308_1 
  openpyxl           pkgs/main/linux-64::o

In [80]:
df_excel = pd.read_excel('data/table.xlsx', sheet_name="Sheet1")
df_excel

Unnamed: 0,School,Class,ID,Gender,Address,Height,Weight,Math,Physics
0,S_1,C_1,1101,M,street_1,173,63,34.0,A+
1,S_1,C_1,1102,F,street_2,192,73,32.5,B+
2,S_1,C_1,1103,M,street_2,186,82,87.2,B+
3,S_1,C_1,1104,F,street_2,167,81,80.4,B-
4,S_1,C_1,1105,F,street_4,159,64,84.8,B+
5,S_1,C_2,1201,M,street_5,188,68,97.0,A-
6,S_1,C_2,1202,F,street_4,176,94,63.5,B-
7,S_1,C_2,1203,M,street_6,160,53,58.8,A+
8,S_1,C_2,1204,F,street_5,162,63,33.8,B
9,S_1,C_2,1205,F,street_6,167,63,68.4,B-


#### Head and Tail

These two methods show the first and the last few records of a DataFrame; the default is 5 rows.

In [81]:
df.head()

Unnamed: 0,School,Class,ID,Gender,Address,Height,Weight,Math,Physics
0,S_1,C_1,1101,M,street_1,173,63,34.0,A+
1,S_1,C_1,1102,F,street_2,192,73,32.5,B+
2,S_1,C_1,1103,M,street_2,186,82,87.2,B+
3,S_1,C_1,1104,F,street_2,167,81,80.4,B-
4,S_1,C_1,1105,F,street_4,159,64,84.8,B+


In [82]:
df.iloc[:6]

Unnamed: 0,School,Class,ID,Gender,Address,Height,Weight,Math,Physics
0,S_1,C_1,1101,M,street_1,173,63,34.0,A+
1,S_1,C_1,1102,F,street_2,192,73,32.5,B+
2,S_1,C_1,1103,M,street_2,186,82,87.2,B+
3,S_1,C_1,1104,F,street_2,167,81,80.4,B-
4,S_1,C_1,1105,F,street_4,159,64,84.8,B+
5,S_1,C_2,1201,M,street_5,188,68,97.0,A-


In [83]:
df.tail()

Unnamed: 0,School,Class,ID,Gender,Address,Height,Weight,Math,Physics
30,S_2,C_4,2401,F,street_2,192,62,45.3,A
31,S_2,C_4,2402,M,street_7,166,82,48.7,B
32,S_2,C_4,2403,F,street_6,158,60,59.7,B+
33,S_2,C_4,2404,F,street_2,160,84,67.7,B
34,S_2,C_4,2405,F,street_6,193,54,47.6,B


In [84]:
df.head(3)

Unnamed: 0,School,Class,ID,Gender,Address,Height,Weight,Math,Physics
0,S_1,C_1,1101,M,street_1,173,63,34.0,A+
1,S_1,C_1,1102,F,street_2,192,73,32.5,B+
2,S_1,C_1,1103,M,street_2,186,82,87.2,B+


### unique and nunique

In [85]:
df['Physics']

0     A+
1     B+
2     B+
3     B-
4     B+
5     A-
6     B-
7     A+
8      B
9     B-
10    B+
11    A-
12     B
13     A
14    B-
15     C
16    B+
17    B-
18    B+
19     A
20     B
21    B+
22    A+
23    B-
24     B
25    B+
26     A
27     C
28    A-
29     B
30     A
31     B
32    B+
33     B
34     B
Name: Physics, dtype: object

In [86]:
df['Physics'].unique() # Shows only unique values

array(['A+', 'B+', 'B-', 'A-', 'B', 'A', 'C'], dtype=object)

In [87]:
# Number of distinct values. Equals len(df['Physics'].unique()) here; with missing
# data the two differ, because nunique() skips NaN by default while unique() keeps it.
df['Physics'].nunique()

7

### count and value_counts

In [88]:
df['School']

0     S_1
1     S_1
2     S_1
3     S_1
4     S_1
5     S_1
6     S_1
7     S_1
8     S_1
9     S_1
10    S_1
11    S_1
12    S_1
13    S_1
14    S_1
15    S_2
16    S_2
17    S_2
18    S_2
19    S_2
20    S_2
21    S_2
22    S_2
23    S_2
24    S_2
25    S_2
26    S_2
27    S_2
28    S_2
29    S_2
30    S_2
31    S_2
32    S_2
33    S_2
34    S_2
Name: School, dtype: object

In [89]:
df['School'].count() # Count of non missing values

35

In [90]:
df['School'].value_counts()

School
S_2    20
S_1    15
Name: count, dtype: int64

In [91]:
df['Physics'].value_counts()

Physics
B+    9
B     8
B-    6
A     4
A+    3
A-    3
C     2
Name: count, dtype: int64

### describe and info

In [92]:
df.info() # non-null count and dtype of each column; a count below the number of rows means missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   School   35 non-null     object 
 1   Class    35 non-null     object 
 2   ID       35 non-null     int64  
 3   Gender   35 non-null     object 
 4   Address  35 non-null     object 
 5   Height   35 non-null     int64  
 6   Weight   35 non-null     int64  
 7   Math     35 non-null     float64
 8   Physics  35 non-null     object 
dtypes: float64(1), int64(3), object(5)
memory usage: 2.6+ KB


In [93]:
df.describe() # summary statistics for numeric type columns

Unnamed: 0,ID,Height,Weight,Math
count,35.0,35.0,35.0,35.0
mean,1803.0,174.142857,74.657143,61.351429
std,536.87741,13.541098,12.895377,19.915164
min,1101.0,155.0,53.0,31.5
25%,1204.5,161.0,63.0,47.4
50%,2103.0,173.0,74.0,61.7
75%,2301.5,187.5,82.0,77.1
max,2405.0,195.0,100.0,97.0


In [94]:
# Request the deciles (10%, 20%, ..., 90%) instead of the default quartiles.
df.describe(percentiles=[k / 10 for k in range(1, 10)])

Unnamed: 0,ID,Height,Weight,Math
count,35.0,35.0,35.0,35.0
mean,1803.0,174.142857,74.657143,61.351429
std,536.87741,13.541098,12.895377,19.915164
min,1101.0,155.0,53.0,31.5
10%,1104.4,158.4,60.4,33.88
20%,1202.8,160.0,62.8,44.06
30%,1301.2,162.4,64.8,48.74
40%,1304.6,167.0,69.6,51.74
50%,2103.0,173.0,74.0,61.7
60%,2201.4,175.4,79.2,67.98


In [107]:
df['Physics'].describe()

count     35
unique     7
top       B+
freq       9
Name: Physics, dtype: object

### idxmax and nlargest

In [108]:
df['Math']

0     34.0
1     32.5
2     87.2
3     80.4
4     84.8
5     97.0
6     63.5
7     58.8
8     33.8
9     68.4
10    31.5
11    87.7
12    49.7
13    85.2
14    61.7
15    83.3
16    50.6
17    52.5
18    72.2
19    34.2
20    39.1
21    68.5
22    73.8
23    47.2
24    85.4
25    72.3
26    32.7
27    65.9
28    95.5
29    48.9
30    45.3
31    48.7
32    59.7
33    67.7
34    47.6
Name: Math, dtype: float64

In [109]:
df['Math'].max() # return the largest value

97.0

In [110]:
df['Math'].idxmax() # return the index of the largest value

5

In [111]:
df['Math'].idxmin() # return the index of the smallest value

10

In [112]:
df['Math'].nlargest(3) # return the largest 3 values with their index (default is 5).

5     97.0
28    95.5
11    87.7
Name: Math, dtype: float64

In [113]:
df['Math'].nlargest()

5     97.0
28    95.5
11    87.7
2     87.2
24    85.4
Name: Math, dtype: float64

In [114]:
df['Math'].nsmallest(3) # return the smallest 3 values with their index (default is 5).

10    31.5
1     32.5
26    32.7
Name: Math, dtype: float64

### apply

In [115]:
df[["Height", "Weight"]]

Unnamed: 0,Height,Weight
0,173,63
1,192,73
2,186,82
3,167,81
4,159,64
5,188,68
6,176,94
7,160,53
8,162,63
9,167,63


In [116]:
df[["Height", "Weight"]].apply(lambda x: x.max() + x.min())

Height    350
Weight    153
dtype: int64

In [117]:
df[["Height", "Weight"]].apply(lambda x: x.mean())

Height    174.142857
Weight     74.657143
dtype: float64

In [118]:
df[["Height", "Weight"]].mean()

Height    174.142857
Weight     74.657143
dtype: float64

In [None]:
df.apply(lambda x:x.count()) # axis defaults to 0: the function is applied to each column

School     35
Class      35
ID         35
Gender     35
Address    35
Height     35
Weight     35
Math       35
Physics    35
dtype: int64

In [119]:
df.apply(lambda x:x.count(), axis=0) # axis=0: the function is applied to each column

School     35
Class      35
ID         35
Gender     35
Address    35
Height     35
Weight     35
Math       35
Physics    35
dtype: int64

In [121]:
df.apply(lambda x:x.count(), axis=1) # axis=1: the function is applied to each row

0     9
1     9
2     9
3     9
4     9
5     9
6     9
7     9
8     9
9     9
10    9
11    9
12    9
13    9
14    9
15    9
16    9
17    9
18    9
19    9
20    9
21    9
22    9
23    9
24    9
25    9
26    9
27    9
28    9
29    9
30    9
31    9
32    9
33    9
34    9
dtype: int64

### sort

In [122]:
df

Unnamed: 0,School,Class,ID,Gender,Address,Height,Weight,Math,Physics
0,S_1,C_1,1101,M,street_1,173,63,34.0,A+
1,S_1,C_1,1102,F,street_2,192,73,32.5,B+
2,S_1,C_1,1103,M,street_2,186,82,87.2,B+
3,S_1,C_1,1104,F,street_2,167,81,80.4,B-
4,S_1,C_1,1105,F,street_4,159,64,84.8,B+
5,S_1,C_2,1201,M,street_5,188,68,97.0,A-
6,S_1,C_2,1202,F,street_4,176,94,63.5,B-
7,S_1,C_2,1203,M,street_6,160,53,58.8,A+
8,S_1,C_2,1204,F,street_5,162,63,33.8,B
9,S_1,C_2,1205,F,street_6,167,63,68.4,B-


In [123]:
df.sort_values(by='Class')

Unnamed: 0,School,Class,ID,Gender,Address,Height,Weight,Math,Physics
0,S_1,C_1,1101,M,street_1,173,63,34.0,A+
19,S_2,C_1,2105,M,street_4,170,81,34.2,A
18,S_2,C_1,2104,F,street_5,159,97,72.2,B+
16,S_2,C_1,2102,F,street_6,161,61,50.6,B+
15,S_2,C_1,2101,M,street_7,174,84,83.3,C
17,S_2,C_1,2103,M,street_4,157,61,52.5,B-
1,S_1,C_1,1102,F,street_2,192,73,32.5,B+
2,S_1,C_1,1103,M,street_2,186,82,87.2,B+
3,S_1,C_1,1104,F,street_2,167,81,80.4,B-
4,S_1,C_1,1105,F,street_4,159,64,84.8,B+


In [124]:
df.sort_values(by=['Address','Height'], ascending=True)

Unnamed: 0,School,Class,ID,Gender,Address,Height,Weight,Math,Physics
0,S_1,C_1,1101,M,street_1,173,63,34.0,A+
11,S_1,C_3,1302,F,street_1,175,57,87.7,A-
23,S_2,C_2,2204,M,street_1,175,74,47.2,B-
33,S_2,C_4,2404,F,street_2,160,84,67.7,B
3,S_1,C_1,1104,F,street_2,167,81,80.4,B-
2,S_1,C_1,1103,M,street_2,186,82,87.2,B+
1,S_1,C_1,1102,F,street_2,192,73,32.5,B+
30,S_2,C_4,2401,F,street_2,192,62,45.3,A
13,S_1,C_3,1304,M,street_2,195,70,85.2,A
22,S_2,C_2,2203,M,street_4,155,91,73.8,A+


In [125]:
df.sort_values(by=['Address','Height'], ascending=[False, True])

Unnamed: 0,School,Class,ID,Gender,Address,Height,Weight,Math,Physics
31,S_2,C_4,2402,M,street_7,166,82,48.7,B
15,S_2,C_1,2101,M,street_7,174,84,83.3,C
24,S_2,C_2,2205,F,street_7,183,76,85.4,B
12,S_1,C_3,1303,M,street_7,188,82,49.7,B
27,S_2,C_3,2303,F,street_7,190,99,65.9,C
21,S_2,C_2,2202,F,street_7,194,77,68.5,B+
32,S_2,C_4,2403,F,street_6,158,60,59.7,B+
7,S_1,C_2,1203,M,street_6,160,53,58.8,A+
16,S_2,C_1,2102,F,street_6,161,61,50.6,B+
28,S_2,C_3,2304,F,street_6,164,81,95.5,A-


In [126]:
df.sort_values(by=['Math','Height'], ascending=[False, True])

Unnamed: 0,School,Class,ID,Gender,Address,Height,Weight,Math,Physics
5,S_1,C_2,1201,M,street_5,188,68,97.0,A-
28,S_2,C_3,2304,F,street_6,164,81,95.5,A-
11,S_1,C_3,1302,F,street_1,175,57,87.7,A-
2,S_1,C_1,1103,M,street_2,186,82,87.2,B+
24,S_2,C_2,2205,F,street_7,183,76,85.4,B
13,S_1,C_3,1304,M,street_2,195,70,85.2,A
4,S_1,C_1,1105,F,street_4,159,64,84.8,B+
15,S_2,C_1,2101,M,street_7,174,84,83.3,C
3,S_1,C_1,1104,F,street_2,167,81,80.4,B-
22,S_2,C_2,2203,M,street_4,155,91,73.8,A+
