In [1]:
import numpy as np
import pandas as pd

In [3]:
dates = ['2025-11-01', '2025-11-02', '2025-11-03', '2025-11-04', '2025-11-05']
datesI = pd.to_datetime(dates)

In [5]:
df1 = pd.DataFrame(np.random.randint(10,20,(5,4)),
                   index = datesI,
                   columns = list('ABCD')
)

In [6]:
df1

Unnamed: 0,A,B,C,D
2025-11-01,13,18,15,17
2025-11-02,13,19,12,18
2025-11-03,14,17,10,15
2025-11-04,12,13,16,17
2025-11-05,17,19,14,19


# Application of functions

In [7]:
df1.describe()

Unnamed: 0,A,B,C,D
count,5.0,5.0,5.0,5.0
mean,13.8,17.2,13.4,17.2
std,1.923538,2.48998,2.408319,1.48324
min,12.0,13.0,10.0,15.0
25%,13.0,17.0,12.0,17.0
50%,13.0,18.0,14.0,17.0
75%,14.0,19.0,15.0,18.0
max,17.0,19.0,16.0,19.0


In [8]:
df1.max()

A    17
B    19
C    16
D    19
dtype: int32

In [9]:
?pd.DataFrame.max

[1;31mSignature:[0m
[0mpd[0m[1;33m.[0m[0mDataFrame[0m[1;33m.[0m[0mmax[0m[1;33m([0m[1;33m
[0m    [0mself[0m[1;33m,[0m[1;33m
[0m    [0maxis[0m[1;33m:[0m [1;34m'Axis | None'[0m [1;33m=[0m [1;36m0[0m[1;33m,[0m[1;33m
[0m    [0mskipna[0m[1;33m:[0m [1;34m'bool'[0m [1;33m=[0m [1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mnumeric_only[0m[1;33m:[0m [1;34m'bool'[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [1;33m**[0m[0mkwargs[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Return the maximum of the values over the requested axis.

If you want the *index* of the maximum, use ``idxmax``. This is the equivalent of the ``numpy.ndarray`` method ``argmax``.

Parameters
----------
axis : {index (0), columns (1)}
    Axis for the function to be applied on.
    For `Series` this parameter is unused and defaults to 0.

    For DataFrames, specifying ``axis=None`` will apply the aggregation
    acros

In [10]:
df1.min(axis=1)

2025-11-01    13
2025-11-02    12
2025-11-03    10
2025-11-04    12
2025-11-05    14
dtype: int32

In [11]:
df1.A.max()

17

In [16]:
df1.iloc[0].min()

13

In [17]:
# Find the maximum value in the entire DataFrame
df1.max().max()

19

In [18]:
df1.A.mode()

0    13
Name: A, dtype: int32

In [19]:
type(df1.A.mode())

pandas.core.series.Series

In [20]:
df1.A.mode()[0]

13

## apply

In [21]:
df1.apply(max)

A    17
B    19
C    16
D    19
dtype: int64

In [22]:
df1.apply(max,axis=1)

2025-11-01    18
2025-11-02    19
2025-11-03    17
2025-11-04    17
2025-11-05    19
dtype: int64

In [23]:
df1['diff'] = df1.max(axis=1) - df1.min(axis=1)

In [24]:
df1

Unnamed: 0,A,B,C,D,diff
2025-11-01,13,18,15,17,5
2025-11-02,13,19,12,18,7
2025-11-03,14,17,10,15,7
2025-11-04,12,13,16,17,5
2025-11-05,17,19,14,19,5


In [25]:
f = lambda x: x.max() - x.min()

In [26]:
df1.apply(f)

A       5
B       6
C       6
D       4
diff    2
dtype: int64

In [28]:
df1.apply(f, axis=1)
# 列表增加了一列（diff）所以结果跟[24]中的diff不一样

2025-11-01    13
2025-11-02    12
2025-11-03    10
2025-11-04    12
2025-11-05    14
dtype: int64

In [29]:
df2 = df1.drop('diff', axis=1)

In [30]:
df2

Unnamed: 0,A,B,C,D
2025-11-01,13,18,15,17
2025-11-02,13,19,12,18
2025-11-03,14,17,10,15
2025-11-04,12,13,16,17
2025-11-05,17,19,14,19


In [31]:
df2.apply(f, axis=1)

2025-11-01    5
2025-11-02    7
2025-11-03    7
2025-11-04    5
2025-11-05    5
dtype: int64

In [32]:
type(df2.apply(f, axis=1))

pandas.core.series.Series

In [33]:
df2['diff'] = df2.apply(f, axis=1)

In [34]:
df2

Unnamed: 0,A,B,C,D,diff
2025-11-01,13,18,15,17,5
2025-11-02,13,19,12,18,7
2025-11-03,14,17,10,15,7
2025-11-04,12,13,16,17,5
2025-11-05,17,19,14,19,5


# Concatenating datasets
## merge

In [35]:
from pandas import Series,DataFrame

In [40]:
dfa = DataFrame(
    {
        '学号' : ['#001', '#002', '#003'],
        '语文' : [87,98,67],
        '数学' : [92,86,78],
        '英语' : [100,87,98]
    },
    index = [1,5,4])

In [41]:
dfb = DataFrame(
    {
        '学号' : ['#001', '#002', '#003'],
        '体育' : [83,68,87]
    })

In [42]:
dfb

Unnamed: 0,学号,体育
0,#001,83
1,#002,68
2,#003,87


In [43]:
pd.merge(dfa,dfb)

Unnamed: 0,学号,语文,数学,英语,体育
0,#001,87,92,100,83
1,#002,98,86,87,68
2,#003,67,78,98,87


In [44]:
df_m1 = pd.merge(dfa,dfb)

In [46]:
df_m1 #indexs changed

Unnamed: 0,学号,语文,数学,英语,体育
0,#001,87,92,100,83
1,#002,98,86,87,68
2,#003,67,78,98,87


In [47]:
dfc = DataFrame(
    {
        '学号' : ['#001', '#002', '#004'],
        '体育' : [83,68,77]
    })

In [48]:
df_m2 = pd.merge(dfa, dfc, on = '学号', how = 'inner') 

In [49]:
df_m2

Unnamed: 0,学号,语文,数学,英语,体育
0,#001,87,92,100,83
1,#002,98,86,87,68


In [50]:
df_m3 = pd.merge(dfa, dfc, on = '学号', how = 'left') 

In [51]:
df_m3

Unnamed: 0,学号,语文,数学,英语,体育
0,#001,87,92,100,83.0
1,#002,98,86,87,68.0
2,#003,67,78,98,


In [52]:
df_m4 = pd.merge(dfa, dfc, on = '学号', how = 'right') 

In [53]:
df_m4

Unnamed: 0,学号,语文,数学,英语,体育
0,#001,87.0,92.0,100.0,83
1,#002,98.0,86.0,87.0,68
2,#004,,,,77


In [54]:
df_m5 = pd.merge(dfa, dfc, on = '学号', how = 'outer')

In [55]:
df_m5

Unnamed: 0,学号,语文,数学,英语,体育
0,#001,87.0,92.0,100.0,83.0
1,#002,98.0,86.0,87.0,68.0
2,#003,67.0,78.0,98.0,
3,#004,,,,77.0


## concat

In [56]:
dfd = DataFrame(
    {
        '学号' : ['#004', '#005', '#003'],
        '语文' : [87,98,67],
        '数学' : [92,86,78],
        '英语' : [100,87,98]
    }
)

In [57]:
dfd

Unnamed: 0,学号,语文,数学,英语
0,#004,87,92,100
1,#005,98,86,87
2,#003,67,78,98


In [58]:
df_m6 = pd.concat([dfa, dfd])

In [59]:
df_m6

Unnamed: 0,学号,语文,数学,英语
1,#001,87,92,100
5,#002,98,86,87
4,#003,67,78,98
0,#004,87,92,100
1,#005,98,86,87
2,#003,67,78,98


In [60]:
df_m6.loc[1]

Unnamed: 0,学号,语文,数学,英语
1,#001,87,92,100
1,#005,98,86,87


In [61]:
df_m6.reset_index()

Unnamed: 0,index,学号,语文,数学,英语
0,1,#001,87,92,100
1,5,#002,98,86,87
2,4,#003,67,78,98
3,0,#004,87,92,100
4,1,#005,98,86,87
5,2,#003,67,78,98


In [63]:
?DataFrame.reset_index

[1;31mSignature:[0m
[0mDataFrame[0m[1;33m.[0m[0mreset_index[0m[1;33m([0m[1;33m
[0m    [0mself[0m[1;33m,[0m[1;33m
[0m    [0mlevel[0m[1;33m:[0m [1;34m'IndexLabel | None'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mdrop[0m[1;33m:[0m [1;34m'bool'[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0minplace[0m[1;33m:[0m [1;34m'bool'[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mcol_level[0m[1;33m:[0m [1;34m'Hashable'[0m [1;33m=[0m [1;36m0[0m[1;33m,[0m[1;33m
[0m    [0mcol_fill[0m[1;33m:[0m [1;34m'Hashable'[0m [1;33m=[0m [1;34m''[0m[1;33m,[0m[1;33m
[0m    [0mallow_duplicates[0m[1;33m:[0m [1;34m'bool | lib.NoDefault'[0m [1;33m=[0m [1;33m<[0m[0mno_default[0m[1;33m>[0m[1;33m,[0m[1;33m
[0m    [0mnames[0m[1;33m:[0m [1;34m'Hashable | Sequence[Hashable] | None'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m 

In [65]:
df_m6.reset_index(drop = True, inplace = True)

In [66]:
df_m6

Unnamed: 0,学号,语文,数学,英语
0,#001,87,92,100
1,#002,98,86,87
2,#003,67,78,98
3,#004,87,92,100
4,#005,98,86,87
5,#003,67,78,98


# Data Cleaning
## Missing value handling

In [68]:
dfx = pd.merge(dfa, dfc, how = 'outer')

In [69]:
dfx

Unnamed: 0,学号,语文,数学,英语,体育
0,#001,87.0,92.0,100.0,83.0
1,#002,98.0,86.0,87.0,68.0
2,#003,67.0,78.0,98.0,
3,#004,,,,77.0


In [70]:
dfx.isnull()

Unnamed: 0,学号,语文,数学,英语,体育
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,True
3,False,True,True,True,False


In [71]:
dfx.notnull()

Unnamed: 0,学号,语文,数学,英语,体育
0,True,True,True,True,True
1,True,True,True,True,True
2,True,True,True,True,False
3,True,False,False,False,True


In [72]:
dfx.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   学号      4 non-null      object 
 1   语文      3 non-null      float64
 2   数学      3 non-null      float64
 3   英语      3 non-null      float64
 4   体育      3 non-null      float64
dtypes: float64(4), object(1)
memory usage: 292.0+ bytes


In [73]:
dfx['语文'].isnull()

0    False
1    False
2    False
3     True
Name: 语文, dtype: bool

In [74]:
dfx[dfx['语文'].isnull()]

Unnamed: 0,学号,语文,数学,英语,体育
3,#004,,,,77.0


In [76]:
dfx['语文'][dfx['语文'].isnull()]

3   NaN
Name: 语文, dtype: float64

In [77]:
dfx.fillna(0)

Unnamed: 0,学号,语文,数学,英语,体育
0,#001,87.0,92.0,100.0,83.0
1,#002,98.0,86.0,87.0,68.0
2,#003,67.0,78.0,98.0,0.0
3,#004,0.0,0.0,0.0,77.0


In [78]:
dfx['体育'].fillna(0)

0    83.0
1    68.0
2     0.0
3    77.0
Name: 体育, dtype: float64

In [79]:
dfx['体育'].mean()

76.0

In [80]:
dfx.体育.fillna(dfx['体育'].mean())

0    83.0
1    68.0
2    76.0
3    77.0
Name: 体育, dtype: float64

In [81]:
dfx

Unnamed: 0,学号,语文,数学,英语,体育
0,#001,87.0,92.0,100.0,83.0
1,#002,98.0,86.0,87.0,68.0
2,#003,67.0,78.0,98.0,
3,#004,,,,77.0


In [83]:
dfx['体育成绩的填补'] = dfx.体育.fillna(dfx['体育'].mean())
dfx #比起前面pd.DataFrame.add的fill_value操作，这里是直接对结果填充，前者是先填充后计算

Unnamed: 0,学号,语文,数学,英语,体育,体育成绩的填补
0,#001,87.0,92.0,100.0,83.0,83.0
1,#002,98.0,86.0,87.0,68.0,68.0
2,#003,67.0,78.0,98.0,,76.0
3,#004,,,,77.0,77.0


In [85]:
df01 = DataFrame(np.arange(1,13).reshape(3,4),
                index = list('ABC'),
                columns = list('abcd'))
df01

Unnamed: 0,a,b,c,d
A,1,2,3,4
B,5,6,7,8
C,9,10,11,12


In [86]:
df02 = DataFrame(np.arange(1,13).reshape(4,3),
                index = list('BCDE'),
                columns = list('cde'))
df02

Unnamed: 0,c,d,e
B,1,2,3
C,4,5,6
D,7,8,9
E,10,11,12


In [87]:
dfx1 = df1 + df2

In [88]:
dfx1

Unnamed: 0,A,B,C,D,diff
2025-11-01,26,36,30,34,10
2025-11-02,26,38,24,36,14
2025-11-03,28,34,20,30,14
2025-11-04,24,26,32,34,10
2025-11-05,34,38,28,38,10


In [89]:
dfx2 = df01.add(df02, fill_value=0)

In [90]:
dfx2

Unnamed: 0,a,b,c,d,e
A,1.0,2.0,3.0,4.0,
B,5.0,6.0,8.0,10.0,3.0
C,9.0,10.0,15.0,17.0,6.0
D,,,7.0,8.0,9.0
E,,,10.0,11.0,12.0


In [91]:
dfx3 = df01.add(df02).fillna(0)

In [92]:
dfx3

Unnamed: 0,a,b,c,d,e
A,0.0,0.0,0.0,0.0,0.0
B,0.0,0.0,8.0,10.0,0.0
C,0.0,0.0,15.0,17.0,0.0
D,0.0,0.0,0.0,0.0,0.0
E,0.0,0.0,0.0,0.0,0.0


In [93]:
dfx4 = df01.add(df02, fill_value=0).fillna(0)

In [94]:
dfx4

Unnamed: 0,a,b,c,d,e
A,1.0,2.0,3.0,4.0,0.0
B,5.0,6.0,8.0,10.0,3.0
C,9.0,10.0,15.0,17.0,6.0
D,0.0,0.0,7.0,8.0,9.0
E,0.0,0.0,10.0,11.0,12.0


In [95]:
dfx

Unnamed: 0,学号,语文,数学,英语,体育,体育成绩的填补
0,#001,87.0,92.0,100.0,83.0,83.0
1,#002,98.0,86.0,87.0,68.0,68.0
2,#003,67.0,78.0,98.0,,76.0
3,#004,,,,77.0,77.0


In [96]:
dfx.dropna()

Unnamed: 0,学号,语文,数学,英语,体育,体育成绩的填补
0,#001,87.0,92.0,100.0,83.0,83.0
1,#002,98.0,86.0,87.0,68.0,68.0


In [97]:
dfx.dropna(axis=1)

Unnamed: 0,学号,体育成绩的填补
0,#001,83.0
1,#002,68.0
2,#003,76.0
3,#004,77.0


In [100]:
dfy = DataFrame(
{
    'c1' : ['apple']*3 + ['banana']*3 + ['apple'],
    'c2' : ['a', 'a', 4, 4, 'b', 'b', 'a']
})

In [101]:
dfy

Unnamed: 0,c1,c2
0,apple,a
1,apple,a
2,apple,4
3,banana,4
4,banana,b
5,banana,b
6,apple,a


In [102]:
dfy.duplicated()

0    False
1     True
2    False
3    False
4    False
5     True
6     True
dtype: bool

In [104]:
?DataFrame.duplicated

[1;31mSignature:[0m
[0mDataFrame[0m[1;33m.[0m[0mduplicated[0m[1;33m([0m[1;33m
[0m    [0mself[0m[1;33m,[0m[1;33m
[0m    [0msubset[0m[1;33m:[0m [1;34m'Hashable | Sequence[Hashable] | None'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mkeep[0m[1;33m:[0m [1;34m'DropKeep'[0m [1;33m=[0m [1;34m'first'[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m [1;33m->[0m [1;34m'Series'[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Return boolean Series denoting duplicate rows.

Considering certain columns is optional.

Parameters
----------
subset : column label or sequence of labels, optional
    Only consider certain columns for identifying duplicates, by
    default use all of the columns.
keep : {'first', 'last', False}, default 'first'
    Determines which duplicates (if any) to mark.

    - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
    - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
    - Fals

In [105]:
dfy.duplicated('c1')

0    False
1     True
2     True
3    False
4     True
5     True
6     True
dtype: bool

In [106]:
dfy.drop_duplicates()

Unnamed: 0,c1,c2
0,apple,a
2,apple,4
3,banana,4
4,banana,b


In [107]:
dfy.drop_duplicates('c1')

Unnamed: 0,c1,c2
0,apple,a
3,banana,4
