In [1]:
import numpy as np
import pandas as pd

## Немного ни о чем

Поговорим про разные представления для пропущенных значений

In [2]:
df_float = pd.DataFrame(
    {
        'column_none': [1., 2., 3., 4., 5., None],
        'column_nan': [1., 2., 3., 4., 5., np.nan],
    }
)

In [3]:
np.array([1., 2., 3., 4., 5., None], dtype='object')

array([1.0, 2.0, 3.0, 4.0, 5.0, None], dtype=object)

In [4]:
df_float['column_none'] == df_float['column_nan']

0     True
1     True
2     True
3     True
4     True
5    False
dtype: bool

In [5]:
df_float

Unnamed: 0,column_none,column_nan
0,1.0,1.0
1,2.0,2.0
2,3.0,3.0
3,4.0,4.0
4,5.0,5.0
5,,


In [6]:
df_float.loc[5]

column_none   NaN
column_nan    NaN
Name: 5, dtype: float64

In [7]:
df_float['column_none'].dtype, df_float['column_nan'].dtype

(dtype('float64'), dtype('float64'))

Видим, что None скастовался в np.nan и стал float. Что будет, если у нас изначально данные из целых чисел?

In [8]:
df_int = pd.DataFrame(
    {
        'column_none': [1, 2, 3, 4, 5, None],
        'column_nan': [1, 2, 3, 4, 5, np.nan],
    }
)

In [9]:
df_int['column_none'] == df_int['column_nan']

0     True
1     True
2     True
3     True
4     True
5    False
dtype: bool

In [10]:
df_int

Unnamed: 0,column_none,column_nan
0,1.0,1.0
1,2.0,2.0
2,3.0,3.0
3,4.0,4.0
4,5.0,5.0
5,,


In [11]:
df_int

Unnamed: 0,column_none,column_nan
0,1.0,1.0
1,2.0,2.0
2,3.0,3.0
3,4.0,4.0
4,5.0,5.0
5,,


Тоже получили конвертацию None в NaN

In [12]:
df_int.loc[5]

column_none   NaN
column_nan    NaN
Name: 5, dtype: float64

In [13]:
df_int['column_none'].dtype, df_int['column_nan'].dtype

(dtype('float64'), dtype('float64'))

Произошел каст, вероятно не очень желательный

In [14]:
df_int['column_nan'] = df_int['column_nan'].astype(np.int16)

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

Просто так скастить в integer, не выбрасывая NaN нельзя

In [None]:
df_int['column_nan'] = df_int['column_nan'].astype("Int16")

О, что-то получилось?

In [None]:
df_int

In [15]:
df_int['column_none'].nbytes, df_int['column_nan'].nbytes

(48, 48)

In [16]:
df_int.column_none.loc[5]

nan

In [17]:
df_int.column_nan.loc[5]

nan

Получили еще одну версию для "ничего"?))0)

In [18]:
df_int.isna()

Unnamed: 0,column_none,column_nan
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
5,True,True


In [19]:
df_int['column_none'].loc[5].nbytes

8

In [20]:
df_int.memory_usage(index=False)

column_none    48
column_nan     48
dtype: int64

In [21]:
df_int.columns = ['column_1', 'column_2']

In [22]:
df_int

Unnamed: 0,column_1,column_2
0,1.0,1.0
1,2.0,2.0
2,3.0,3.0
3,4.0,4.0
4,5.0,5.0
5,,


In [23]:
df_int.loc[1, 'column_1'] = None
df_int.loc[1, 'column_2'] = None

In [24]:
df_int

Unnamed: 0,column_1,column_2
0,1.0,1.0
1,,
2,3.0,3.0
3,4.0,4.0
4,5.0,5.0
5,,


Произошел еще один type cast

In [25]:
nas = df_int.column_2[df_int.column_2.isna()]

In [26]:
nas.iloc[0] == nas.iloc[1]

False

In [27]:
nas.iloc[0] is nas.iloc[1]

False

!!!

In [28]:
id(nas.iloc[0]), id(nas.iloc[1])

(5012871696, 5012871696)

Это один и тот же объект!

In [29]:
nans = df_int.column_1[df_int.column_1.isna()]

In [30]:
nans

1   NaN
5   NaN
Name: column_1, dtype: float64

In [31]:
a, b = nans.values

In [32]:
a, b

(nan, nan)

In [33]:
a == b

False

In [34]:
a is b

False

In [35]:
id(a), id(b)

(5012870192, 5012866352)

И напоследок

In [36]:
set([float('nan'), float('nan')])

{nan, nan}

In [37]:
set([np.float64('nan'), np.float64('nan')])

{nan, nan}

In [38]:
set([pd.NA, pd.NA])

{<NA>}

In [39]:
set([np.nan, np.nan])

{nan}

In [40]:
np.nan is np.nan is np.NaN is np.NAN

True

In [41]:
pd.NA is pd.NA

True

In [42]:
type(1 + pd.NA)

pandas._libs.missing.NAType

In [43]:
id(1 + pd.NA), id(pd.NA)

(4493530832, 4493530832)

In [44]:
np.nan is np.NaN is np.NAN

True

In [45]:
type(1 + np.nan)

float

In [46]:
id(np.nan), id(np.nan + 1)

(4441670128, 4814872464)

## Задачи
Почему индексы важны: потому что при element-wise операциях происходит мэтчинг по индексам.

In [47]:
series1 = pd.Series([0, 1, 2])
series2 = pd.Series([0, 1, 2], index=[2, 1, 0])
series3 = pd.Series([0, 1, 2], index=[1, 1, 1])

In [48]:
series1 - series1

0    0
1    0
2    0
dtype: int64

In [49]:
series1 - series2

0   -2
1    0
2    2
dtype: int64

In [50]:
series1 - series3

0    NaN
1    1.0
1    0.0
1   -1.0
2    NaN
dtype: float64

In [51]:
# np.array, так что все ок
series1.values - series3.values

array([0, 0, 0])

###  Given series A and series B

In [52]:
series_a = pd.Series([1, 2, 4, 3])
series_b = pd.Series([3, 4, 5, 6])

- Items is series A not present in series B

In [53]:
series_a[~series_a.isin(series_b)]

0    1
1    2
dtype: int64

- Intersection of series

In [54]:
np.intersect1d(series_a, series_b)

array([3, 4])

In [55]:
set(series_a) & set(series_b)

{3, 4}

- Items presented only in one of the series, not in both

In [56]:
np.setdiff1d(series_a, series_b), np.setdiff1d(series_b, series_a)

(array([1, 2]), array([5, 6]))

In [57]:
np.union1d(np.setdiff1d(series_a, series_b), np.setdiff1d(series_b, series_a))

array([1, 2, 5, 6])

или

In [58]:
np.setxor1d(series_a, series_b)

array([1, 2, 5, 6])

или

In [59]:
series_union = pd.Series(np.union1d(series_a, series_b))
series_intersect = pd.Series(np.intersect1d(series_a, series_b))

series_union, series_intersect

(0    1
 1    2
 2    3
 3    4
 4    5
 5    6
 dtype: int64,
 0    3
 1    4
 dtype: int64)

In [60]:
series_union[~series_union.isin(series_intersect)]

0    1
1    2
4    5
5    6
dtype: int64

### Merge by column pairs: fruit-pazham, weight-kilo

In [61]:
df1 = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                    'weight': ['high', 'medium', 'low'] * 3,
                    'price': np.random.randint(0, 15, 9)})

df2 = pd.DataFrame({'pazham': ['apple', 'orange', 'pine'] * 2,
                    'kilo': ['high', 'low'] * 3,
                    'price': np.random.randint(0, 15, 6)})

In [62]:
df1.head()

Unnamed: 0,fruit,weight,price
0,apple,high,10
1,banana,medium,10
2,orange,low,11
3,apple,high,4
4,banana,medium,10


In [63]:
df2.head()

Unnamed: 0,pazham,kilo,price
0,apple,high,3
1,orange,low,9
2,pine,high,0
3,apple,low,0
4,orange,high,12


In [67]:
df_merged = pd.merge(df1, df2, how='inner', left_on=['fruit', 'weight'], right_on = ['pazham', 'kilo'], suffixes=('_left', '_right'))
df_merged.head()

Unnamed: 0,fruit,weight,price_left,pazham,kilo,price_right
0,apple,high,10,apple,high,3
1,orange,low,11,orange,low,9
2,apple,high,4,apple,high,3
3,orange,low,9,orange,low,9
4,apple,high,10,apple,high,3


Lets explore dropping duplicate rows

In [68]:
df_merged.drop_duplicates(keep='first')

Unnamed: 0,fruit,weight,price_left,pazham,kilo,price_right
0,apple,high,10,apple,high,3
1,orange,low,11,orange,low,9
2,apple,high,4,apple,high,3
3,orange,low,9,orange,low,9
5,orange,low,14,orange,low,9


###  Reverse all rows (first row should become last etc.)

In [69]:
df = pd.DataFrame(np.arange(30).reshape(-1, 6), columns=map(lambda x: f'column_{x}', range(6)))

In [70]:
df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5
0,0,1,2,3,4,5
1,6,7,8,9,10,11
2,12,13,14,15,16,17
3,18,19,20,21,22,23
4,24,25,26,27,28,29


In [71]:
df[::-1]

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5
4,24,25,26,27,28,29
3,18,19,20,21,22,23
2,12,13,14,15,16,17
1,6,7,8,9,10,11
0,0,1,2,3,4,5


In [73]:
df.iloc[::-1, :]

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5
4,24,25,26,27,28,29
3,18,19,20,21,22,23
2,12,13,14,15,16,17
1,6,7,8,9,10,11
0,0,1,2,3,4,5


### Get column name with the highest number of row-wise maximum’s in dataframe

In [74]:
df = pd.DataFrame(np.random.randint(0, 100, 50).reshape(-1, 5), columns=map(lambda x: f'column_{x}', range(5)))

In [75]:
df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4
0,82,92,50,39,87
1,14,42,4,21,9
2,20,79,99,24,51
3,8,86,86,59,29
4,96,48,3,4,39
5,71,43,63,76,47
6,14,59,23,41,8
7,28,59,0,4,82
8,85,91,32,20,17
9,58,62,71,42,62


вариант 1:

In [79]:
df.columns[(df.values == df.values.max(axis=1)[:, None]).sum(axis=0).argmax()]

'column_1'

вариант 2:

In [80]:
df.idxmax(axis=1).value_counts().index[0]

'column_1'

### Find the positions of numbers that are multiples of N

In [81]:
N = 5

In [82]:
df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4
0,82,92,50,39,87
1,14,42,4,21,9
2,20,79,99,24,51
3,8,86,86,59,29
4,96,48,3,4,39
5,71,43,63,76,47
6,14,59,23,41,8
7,28,59,0,4,82
8,85,91,32,20,17
9,58,62,71,42,62


For each column Series separately 

In [83]:
pos_map = df.apply(lambda x: x % N == 0)
for col in pos_map.columns:
    ser = pos_map.loc[:, col]
    print(f'{col}: {ser[ser].index.tolist()}')

column_0: [2, 8]
column_1: []
column_2: [0, 7]
column_3: [8]
column_4: []


```
column_0: [0, 2, 6, 7]
column_1: [0, 5]
column_2: [6, 9]
column_3: [7, 8]
column_4: [5, 8]
```

Now try to treat rows and columns as coordinates. Return list (or array) of pairs for such elements. (one-liner)

In [84]:
np.argwhere(df.values % N == 0).tolist()

[[0, 2], [2, 0], [7, 2], [8, 0], [8, 3]]

```[[0, 0],
 [0, 1],
 [2, 0],
 [5, 1],
 [5, 4],
 [6, 0],
 [6, 2],
 [7, 0],
 [7, 3],
 [8, 3],
 [8, 4],
 [9, 2]]
 ```

### Normalize all columns of df by subtracting the column mean and divide by standard deviation.

In [87]:
df.head()

Unnamed: 0,column_0,column_1,column_2,column_3,column_4
0,82,92,50,39,87
1,14,42,4,21,9
2,20,79,99,24,51
3,8,86,86,59,29
4,96,48,3,4,39


In [104]:
(df.values - df.values.mean(axis=0)) / df.values.std(axis=0)

array([[ 1.05896796,  1.40196762,  0.20164666,  0.27437622,  1.63983008],
       [-1.0343408 , -1.30453357, -1.14266443, -0.54875243, -1.27376323],
       [-0.84963708,  0.69827731,  1.63363022, -0.41156433,  0.29509471],
       [-1.21904451,  1.07718747,  1.25371622,  1.18896361, -0.52668802],
       [ 1.48994329, -0.97975343, -1.17188859, -1.32615172, -0.15315042],
       [ 0.72034448, -1.25040355,  0.58156067,  1.96636289,  0.14567967],
       [-1.0343408 , -0.38432317, -0.5874055 ,  0.36583496, -1.31111699],
       [-0.60336547, -0.38432317, -1.25956105, -1.32615172,  1.45306128],
       [ 1.15131982,  1.34783759, -0.32438811, -0.5944818 , -0.97493314],
       [ 0.3201531 , -0.2219331 ,  0.8153539 ,  0.41156433,  0.70598607]])

### Range all columns of df such that the minimum value in each column is 0 and max is 1

In [88]:
df.head()

Unnamed: 0,column_0,column_1,column_2,column_3,column_4
0,82,92,50,39,87
1,14,42,4,21,9
2,20,79,99,24,51
3,8,86,86,59,29
4,96,48,3,4,39


In [105]:
(df.values - df.values.min(axis=0)) / (df.values.max(axis=0) - df.values.min(axis=0))

array([[0.84090909, 1.        , 0.50505051, 0.48611111, 1.        ],
       [0.06818182, 0.        , 0.04040404, 0.23611111, 0.01265823],
       [0.13636364, 0.74      , 1.        , 0.27777778, 0.5443038 ],
       [0.        , 0.88      , 0.86868687, 0.76388889, 0.26582278],
       [1.        , 0.12      , 0.03030303, 0.        , 0.39240506],
       [0.71590909, 0.02      , 0.63636364, 1.        , 0.49367089],
       [0.06818182, 0.34      , 0.23232323, 0.51388889, 0.        ],
       [0.22727273, 0.34      , 0.        , 0.        , 0.93670886],
       [0.875     , 0.98      , 0.32323232, 0.22222222, 0.11392405],
       [0.56818182, 0.4       , 0.71717172, 0.52777778, 0.6835443 ]])

### Create a column that contains the second-largest value in each row?

In [89]:
df.head()

Unnamed: 0,column_0,column_1,column_2,column_3,column_4
0,82,92,50,39,87
1,14,42,4,21,9
2,20,79,99,24,51
3,8,86,86,59,29
4,96,48,3,4,39


In [106]:
df.apply(lambda row: sorted(row.values, reverse=True)[1], axis=1)

0    87
1    21
2    79
3    86
4    48
5    71
6    41
7    59
8    85
9    62
dtype: int64

### Split a text column into two separate columns?

In [107]:
df_text = pd.DataFrame(
    {
        'row':
        [
            'id\t Name, Surname',
            '2\t Nadal, Raphael',
            '5\t Djokovic,  Novak',
            '1\t Federer, Roger'
        ]
    }
)
df_text

Unnamed: 0,row
0,"id\t Name, Surname"
1,"2\t Nadal, Raphael"
2,"5\t Djokovic, Novak"
3,"1\t Federer, Roger"


In [110]:
df_text['row'].str.split('\t', expand=True)

Unnamed: 0,0,1
0,id,"Name, Surname"
1,2,"Nadal, Raphael"
2,5,"Djokovic, Novak"
3,1,"Federer, Roger"


---

Больше [упражнений на Pandas с решениями](https://www.machinelearningplus.com/python/101-pandas-exercises-python/)