# Лекция 3

## Pandas - расширение NumPy (структурированные массивы). Строки и столбцы индексируются метками, а не только числовыми значениями

## Series

In [74]:
import pandas as pd
import numpy as np

In [75]:
# Build a Series from a plain Python list; no index is given, so pandas
# numbers the elements 0..3 automatically.
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0])

# Show the Series and then its Python type, one per line.
print(data, type(data), sep="\n")

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64
<class 'pandas.core.series.Series'>


In [76]:
# .values exposes the underlying data as a NumPy array.
print(data.values)
print(type(data.values))
# .index holds the labels; for this Series it is the automatically created RangeIndex.
print(data.index)
print(type(data.index))

[0.25 0.5  0.75 1.  ]
<class 'numpy.ndarray'>
RangeIndex(start=0, stop=4, step=1)
<class 'pandas.core.indexes.range.RangeIndex'>


In [77]:
# A single key returns one element.
print(data[0])
# An integer slice returns a sub-Series; the stop position (3) is not included.
print(data[1:3])

0.25
1    0.50
2    0.75
dtype: float64


In [78]:
# Same values as before, but with explicit string labels as the index.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=list("abcd"))

# Print the Series, one element looked up by label, and a label slice
# (the end label 'd' is part of the result).
print(data, data["a"], data["b":"d"], sep="\n")

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
0.25
b    0.50
c    0.75
d    1.00
dtype: float64


In [79]:
# With explicit string labels the index is a generic Index object.
print(type(data.index))

<class 'pandas.core.indexes.base.Index'>


In [80]:
# Labels may mix types and do not have to be sorted or consecutive.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[1, 10, 7, 'd'])
print(data)
# The integer key is looked up as a label here (label 1 -> 0.25).
print(data[1])
# Label slice from 10 through 'd'; the end label is included.
print(data[10:'d'])

1     0.25
10    0.50
7     0.75
d     1.00
dtype: float64
0.25
10    0.50
7     0.75
d     1.00
dtype: float64


In [81]:
# City -> population mapping; the dict keys become the Series index labels.
population_dict = dict(
    city_1=1001,
    city_2=1002,
    city_3=1003,
    city_4=1004,
    city_5=1005,
)

population = pd.Series(data=population_dict)

# Print the whole Series, then a label slice (end label "city_5" is included).
print(population, population["city_4":"city_5"], sep="\n")

city_1    1001
city_2    1002
city_3    1003
city_4    1004
city_5    1005
dtype: int64
city_4    1004
city_5    1005
dtype: int64


### Для создания Series можно использовать: <br>
### - списки Python и массивы NumPy <br>
### - скалярные значения <br>
### - словари <br>

## DataFrame - двумерный массив с явно определёнными индексами. Последовательность "согласованных" по индексу объектов Series.

In [82]:
# Two mappings keyed by the same five city labels: population and area.
population_dict = {f"city_{n}": 1000 + n for n in range(1, 6)}
area_dict = {f"city_{n}": 9990 + n for n in range(1, 6)}

# One Series per mapping; both end up with the same index labels.
population, area = pd.Series(population_dict), pd.Series(area_dict)

print(population, area, sep="\n")

city_1    1001
city_2    1002
city_3    1003
city_4    1004
city_5    1005
dtype: int64
city_1    9991
city_2    9992
city_3    9993
city_4    9994
city_5    9995
dtype: int64


In [83]:
# Combine the two Series into a DataFrame: each dict key becomes a column name
# and the shared city labels become the row index.
states = pd.DataFrame({
    "population": population,
    "area": area,
})

print(states)

# Underlying 2-D NumPy array, then the row labels and the column labels.
print(states.values)
print(states.index)
print(states.columns)

# Types of the frame itself, of its raw values and of its column index.
print(type(states))
print(type(states.values))
print(type(states.columns))

        population  area
city_1        1001  9991
city_2        1002  9992
city_3        1003  9993
city_4        1004  9994
city_5        1005  9995
[[1001 9991]
 [1002 9992]
 [1003 9993]
 [1004 9994]
 [1005 9995]]
Index(['city_1', 'city_2', 'city_3', 'city_4', 'city_5'], dtype='object')
Index(['population', 'area'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>
<class 'pandas.core.indexes.base.Index'>


In [84]:
# Selecting one column by name returns it as a Series named after that column.
print(states['area'])

city_1    9991
city_2    9992
city_3    9993
city_4    9994
city_5    9995
Name: area, dtype: int64


### DataFrame. Способы создания: <br>
### - через объект Series <br>
### - списки словарей <br>
### - словари объектов Series <br>
### - двумерный массив NumPy <br>
### - структурированный массив NumPy

## Index - способ организации ссылки на данные объектов Series и DataFrame. Index - неизменяем, упорядочен, является мультимножеством (могут быть повторяющиеся значения).

In [85]:
# An Index can be read like an array: scalar indexing and slicing both work.
ind = pd.Index([2, 3, 5, 7, 11])

# Second element, then every other element starting from the first.
print(ind[1], ind[::2], sep="\n")

3
Index([2, 5, 11], dtype='int64')


In [86]:
# Deliberate failure: an Index is immutable, so item assignment raises TypeError.
ind[1] = 5

TypeError: Index does not support mutable operations

### Index - следует соглашениям объекта set (Python).

In [25]:
# Two overlapping indexes used to demonstrate set-style operations.
indA, indB = pd.Index([1, 2, 3, 4, 5]), pd.Index([2, 3, 4, 5, 6])

# Labels present in both indexes.
print(indA.intersection(indB))

Index([2, 3, 4, 5], dtype='int64')


## Выборка данных из Series

### Как словарь:

In [29]:
# A Series with string labels, used here through its dict-like interface.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=list("abcd"))

# `in` tests membership among the index labels, like `in` on dict keys.
for label in ("a", "z"):
    print(label in data)

# keys() returns the index; items() yields (label, value) pairs.
print(data.keys())
print(list(data.items()))

True
False
Index(['a', 'b', 'c', 'd'], dtype='object')
[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]


In [31]:
# Assigning to an existing label overwrites its value ...
data['a'] = 100
# ... while assigning to a new label appends a new element to the Series.
data['z'] = 1000

print(data)

a     100.00
b       0.50
c       0.75
d       1.00
z    1000.00
dtype: float64


### Как одномерный массив:

In [None]:
# A Series with string labels, indexed here the way a 1-D array would be.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])

print(data['a':'c'])  # label slice: the end label is included
# Integer slice: positional, the stop position is excluded.
print(data[0:2])
# Boolean mask: keeps only elements strictly between 0.5 and 1.
print(data[(data > 0.5) & (data < 1)])
# A list of labels selects exactly those elements.
print(data[['a', 'd']])

a    0.25
b    0.50
c    0.75
dtype: float64
a    0.25
b    0.50
dtype: float64
c    0.75
dtype: float64
a    0.25
d    1.00
dtype: float64


### Атрибуты-индексаторы

In [38]:
# Integer labels that differ from positions, to contrast [] / .loc / .iloc.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[1, 3, 10, 15])

print(data[1])  # looked up as the label 1, not as a position
# .loc always uses labels: label 1 -> 0.25.
print(data.loc[1])
# .iloc always uses positions: position 1 -> 0.5.
print(data.iloc[1])

0.25
0.25
0.5


## Выборка данных из DataFrame

### Как словарь:

In [46]:
# Three-column frame; "pop" repeats the population Series under a second name.
data = pd.DataFrame({"area": area, "population": population, "pop": population})
# Bare expression so the notebook renders the frame as the cell result.
data

Unnamed: 0,area,population,pop
city_1,9991,1001,1001
city_2,9992,1002,1002
city_3,9993,1003,1003
city_4,9994,1004,1004
city_5,9995,1005,1005


In [47]:
# Two ways to read a column: dict-style key lookup and attribute access.
print(data["area"])
print(data.area)
# For an ordinary column name both forms gave the same object in the recorded
# run (it printed True).
print(data.population is data["population"])
# Attribute access is not available when the column name collides with a
# DataFrame method: `data.pop` is the DataFrame.pop method, not the "pop"
# column, so this comparison is False.
print(data.pop is data["pop"])

city_1    9991
city_2    9992
city_3    9993
city_4    9994
city_5    9995
Name: area, dtype: int64
city_1    9991
city_2    9992
city_3    9993
city_4    9994
city_5    9995
Name: area, dtype: int64
True
False


In [51]:
# Assigning to a new key adds a column: first a copy of the "area" values ...
data["new"] = data["area"]
# ... then a column computed element-wise from two existing columns.
data["new1"] = data["area"] / data["pop"]
print(data)

        area  population   pop   new      new1
city_1  9991        1001  1001  9991  9.981019
city_2  9992        1002  1002  9992  9.972056
city_3  9993        1003  1003  9993  9.963111
city_4  9994        1004  1004  9994  9.954183
city_5  9995        1005  1005  9995  9.945274


### Как двумерный массив NumPy:

In [60]:
# Rebuild the three-column frame (without the extra columns added earlier).
data = pd.DataFrame({"area": area, "population": population, "pop": population})
# Raw values as a 2-D NumPy array (rows = cities, columns = fields).
print(data.values)
# Transposed frame: fields become rows, cities become columns.
print(data.T)
# First row of the raw array, i.e. the values for city_1.
print(data.values[0])

[[9991 1001 1001]
 [9992 1002 1002]
 [9993 1003 1003]
 [9994 1004 1004]
 [9995 1005 1005]]
            city_1  city_2  city_3  city_4  city_5
area          9991    9992    9993    9994    9995
population    1001    1002    1003    1004    1005
pop           1001    1002    1003    1004    1005
[9991 1001 1001]


### Атрибуты-индексаторы

In [64]:
# .iloc is positional and end-exclusive: first three rows, column at position 1 only.
print(data.iloc[:3, 1:2])
# .loc is label-based and end-inclusive: rows up to "city_4", columns "population" through "pop".
print(data.loc[:"city_4", "population":"pop"])
# .loc also accepts a boolean row mask combined with a list of column labels.
print(data.loc[data["pop"] > 1002, ["area", "pop"]])

        population
city_1        1001
city_2        1002
city_3        1003
        population   pop
city_1        1001  1001
city_2        1002  1002
city_3        1003  1003
city_4        1004  1004
        area   pop
city_3  9993  1003
city_4  9994  1004
city_5  9995  1005


In [66]:
# Overwrite a single cell by position: row 0 (city_1), column 2 ("pop").
data.iloc[0, 2] = 99999
# Bare expression so the notebook renders the modified frame.
data

Unnamed: 0,area,population,pop
city_1,9991,1001,99999
city_2,9992,1002,1002
city_3,9993,1003,1003
city_4,9994,1004,1004
city_5,9995,1005,1005


## Универсальные функции

In [70]:
# Unseeded generator: the four random integers from [0, 10) differ on every run.
rng = np.random.default_rng()
s = pd.Series(rng.integers(0, 10, 4))

print(s)
# A NumPy ufunc applied to a Series returns a Series with the same index.
print(np.exp(s))

0    8
1    2
2    6
3    5
dtype: int64
0    2980.957987
1       7.389056
2     403.428793
3     148.413159
dtype: float64


## Несовпадение индексов

In [71]:
# The two mappings share city_1..city_3 but deliberately differ in the last
# two keys (city_41/city_51 versus city_42/city_52).
population_dict = dict(
    city_1=1001,
    city_2=1002,
    city_3=1003,
    city_41=1004,
    city_51=1005,
)

area_dict = dict(
    city_1=9991,
    city_2=9992,
    city_3=9993,
    city_42=9994,
    city_52=9995,
)

# One Series per mapping; their indexes only partially overlap.
population = pd.Series(data=population_dict)
area = pd.Series(data=area_dict)

In [73]:
# Building a frame from Series with different indexes: the rows are the union
# of both label sets, and a cell with no source value becomes NaN (the
# recorded output also shows the numbers as floats).
states = pd.DataFrame({
    "population": population,
    "area": area,
})

# Bare expression so the notebook renders the frame.
states

Unnamed: 0,population,area
city_1,1001.0,9991.0
city_2,1002.0,9992.0
city_3,1003.0,9993.0
city_41,1004.0,
city_42,,9994.0
city_51,1005.0,
city_52,,9995.0


In [89]:
# Two random integer frames of different shapes: 2x2 (columns a, b) and 3x3 (a, b, c).
dfA = pd.DataFrame(rng.integers(0, 10, (2, 2)), columns=['a', 'b'])
dfB = pd.DataFrame(rng.integers(0, 10, (3, 3)), columns=['a', 'b', 'c'])

print(dfA)
print(dfB)
# Addition aligns on both row and column labels; positions that exist in only
# one of the frames come out as NaN.
print(dfA + dfB)

   a  b
0  4  3
1  9  9
   a  b  c
0  0  5  4
1  3  9  7
2  3  5  6
      a     b   c
0   4.0   8.0 NaN
1  12.0  18.0 NaN
2   NaN   NaN NaN


In [90]:
# Seeded generator, so the same 3x4 matrix of integers from [0, 10) is
# produced on every run.
rng = np.random.default_rng(seed=1)
A = rng.integers(low=0, high=10, size=(3, 4))
print(A)

[[4 5 7 9]
 [0 1 8 9]
 [2 3 8 4]]


In [91]:
# First row of the matrix.
print(A[0])
# NumPy broadcasting: the first row is subtracted from every row.
print(A - A[0])

[4 5 7 9]
[[ 0  0  0  0]
 [-4 -4  1  0]
 [-2 -2  1 -5]]


In [93]:
# Wrap the matrix in a DataFrame with named columns; rows keep the default 0..2 index.
df = pd.DataFrame(A, columns=['a', 'b', 'c', 'd'])
# Bare expression so the notebook renders the frame.
df

Unnamed: 0,a,b,c,d
0,4,5,7,9
1,0,1,8,9
2,2,3,8,4


In [95]:
# First row as a Series whose index is the column labels.
print(df.iloc[0])
# Subtracting that Series from the frame works row by row, matching on column labels.
print(df - df.iloc[0])

a    4
b    5
c    7
d    9
Name: 0, dtype: int64
   a  b  c  d
0  0  0  0  0
1 -4 -4  1  0
2 -2 -2  1 -5


In [96]:
# The subtracted Series holds only every second column (a and c); columns it
# lacks (b and d) have nothing to align with and come out as NaN.
print(df - df.iloc[0, ::2])

     a   b    c   d
0  0.0 NaN  0.0 NaN
1 -4.0 NaN  1.0 NaN
2 -2.0 NaN  1.0 NaN


## NA-значения: NaN, null, -99999.

### Pandas. Два способа хранения отсутствующих значений: <br>
### - индикаторы NaN, None <br>
### - null

### None - объект (накладные расходы). Не работает с sum, min.

In [98]:
# Baseline: summing an integer array with no missing values.
val1 = np.array([1, 2, 3])
print(val1.sum())

6


In [102]:
# Array containing a NaN; the presence of NaN makes the whole array floating point.
val1 = np.array([1, np.nan, 2, 3])

# The array, its plain sum (NaN propagates) and the NaN-ignoring sum.
print(val1, val1.sum(), np.nansum(val1), sep="\n")

[ 1. nan  2.  3.]
nan
6.0


In [103]:
# Integer Series 0..9 with the default index. The recorded output shows int32;
# NOTE(review): the width that dtype=int maps to looks platform-dependent — confirm.
x = pd.Series(range(10), dtype=int)
print(x)

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32


In [104]:
# Assigning None or np.nan into the integer Series: in the recorded output both
# are stored as NaN and the whole Series is upcast to float64.
x[0] = None
x[1] = np.nan

# Bare expression so the notebook renders the Series.
x

0    NaN
1    NaN
2    2.0
3    3.0
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64

In [106]:
# Series of strings; the recorded output shows dtype object.
x1 = pd.Series(['a', 'b', 'c'])
# In the recorded output None and NaN are both kept as distinct objects,
# with no conversion between them.
x1[0] = None
x1[1] = np.nan

# Bare expression so the notebook renders the Series.
x1

0    None
1     NaN
2       c
dtype: object

In [107]:
# Mixing integers with np.nan, None and pd.NA without an explicit dtype:
# the recorded output shows dtype object with each marker preserved as given.
x2 = pd.Series([1, 2, 3, np.nan, None, pd.NA])
# Bare expression so the notebook renders the Series.
x2

0       1
1       2
2       3
3     NaN
4    None
5    <NA>
dtype: object

In [108]:
# Same values stored with the nullable integer dtype Int32: the three
# different missing-value markers all end up as <NA>.
x3 = pd.Series([1, 2, 3, np.nan, None, pd.NA], dtype=pd.Int32Dtype())
# Bare expression so the notebook renders the Series.
x3

0       1
1       2
2       3
3    <NA>
4    <NA>
5    <NA>
dtype: Int32

In [113]:
# Boolean mask marking the missing entries.
print(x3.isnull())
# Mask used as a filter: only the missing entries ...
print(x3[x3.isnull()])
# ... and only the present entries.
print(x3[x3.notnull()])
# dropna() gives the same result as filtering with notnull().
print(x3.dropna())

0    False
1    False
2    False
3     True
4     True
5     True
dtype: bool
3    <NA>
4    <NA>
5    <NA>
dtype: Int32
0    1
1    2
2    3
dtype: Int32
0    1
1    2
2    3
dtype: Int32


In [114]:
# 3x6 frame with missing values of several kinds (np.nan, None, pd.NA).
# Column 3 holds 4 in the second and third rows, matching the recorded output;
# with None there the column would be entirely NA and dropna() would return
# an empty frame.
df = pd.DataFrame(
    [
        [1, 2, 3, np.nan, None, pd.NA],
        [1, 2, 3, 4, 5, 6],
        [1, np.nan, 3, 4, np.nan, 6]
    ]
)
print(df)
# Default: drop every row that contains at least one missing value.
print(df.dropna())
# axis=0 is that same row-wise default, written explicitly.
print(df.dropna(axis=0))
# axis=1 drops columns instead: every column with at least one missing value.
print(df.dropna(axis=1))

   0    1  2    3    4     5
0  1  2.0  3  NaN  NaN  <NA>
1  1  2.0  3  4.0  5.0     6
2  1  NaN  3  4.0  NaN     6
   0    1  2    3    4  5
1  1  2.0  3  4.0  5.0  6
   0    1  2    3    4  5
1  1  2.0  3  4.0  5.0  6
   0  2
0  1  3
1  1  3
2  1  3


### how: <br>
### - all - все значения NA
### - any - хотя бы одно значение
### - thresh - x, остаётся, если присутствует МИНИМУМ x непустых значений (отдельный параметр `thresh=x`, а не значение `how`)

In [117]:
# how="all": drop a column only when every value in it is missing.
print(df.dropna(axis=1, how="all"))
# how="any": drop a column as soon as it has at least one missing value.
print(df.dropna(axis=1, how="any"))
# thresh=2: keep a column only if it has at least 2 non-missing values.
print(df.dropna(axis=1, thresh=2))

   0    1  2    3    4     5
0  1  2.0  3  NaN  NaN  <NA>
1  1  2.0  3  4.0  5.0     6
2  1  NaN  3  4.0  NaN     6
   0  2
0  1  3
1  1  3
2  1  3
   0    1  2    3     5
0  1  2.0  3  NaN  <NA>
1  1  2.0  3  4.0     6
2  1  NaN  3  4.0     6
