## Pandas — библиотека для работы с данными

In [1]:
!pip install pandas

Collecting pandas
  Downloading pandas-1.4.3-cp310-cp310-win_amd64.whl (10.5 MB)
     -------------------------------------- 10.5/10.5 MB 354.1 kB/s eta 0:00:00
Collecting pytz>=2020.1
  Downloading pytz-2022.2.1-py2.py3-none-any.whl (500 kB)
     ------------------------------------ 500.6/500.6 kB 259.3 kB/s eta 0:00:00
Installing collected packages: pytz, pandas
Successfully installed pandas-1.4.3 pytz-2022.2.1


In [2]:
import pandas as pd

### Объект класса (типа) `Series`

In [3]:
# A Series is a one-dimensional array of indexed data. No index is passed
# here, so the elements get the default integer labels 0..3.
s1 = pd.Series(data=[10, 20, 30, 40])
s1

0    10
1    20
2    30
3    40
dtype: int64

In [4]:
# Index labels of s1; no index was supplied, so pandas generated a RangeIndex.
s1.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# Without brackets, .loc just evaluates to the label-based indexer object itself.
s1.loc

<pandas.core.indexing._LocIndexer at 0x553cf82c0>

In [6]:
# Look up the element whose index label is 0.
s1.loc[0]

10

In [7]:
# Build a Series from a dict: the keys become the index labels (kept in
# insertion order) and may mix strings with integers.
d1 = {
    'a': 100,
    5: 20,
    0: 50,
}
s2 = pd.Series(data=d1)
s2

a    100
5     20
0     50
dtype: int64

In [8]:
# Explicit index: select an element by its label
s2['a']

100

In [9]:
# Plain [] with an integer key: here 0 is resolved as the index *label* 0
# (value 50), not as position 0 (which holds 100).
s2[0]

50

In [10]:
# .loc is the indexer attribute for explicit (label-based) indexing
s2.loc[0]

50

In [11]:
# Implicit index
# .iloc is the indexer attribute for implicit (positional) indexing
s2.iloc[0]

100

In [12]:
# Negative positions count from the end: -1 is the last element.
s2.iloc[-1]

50

In [13]:
# Shape tuple; a Series is one-dimensional, so it has a single entry.
s2.shape

(3,)

In [14]:
# Number of elements in the Series.
s2.size

3

In [15]:
# The underlying values as a NumPy array, without the index labels.
s2.values

array([100,  20,  50], dtype=int64)

In [16]:
# Population of several cities in Yakutia, keyed by city name.
pop_dict = {"Yakutsk": 318_768, "Neryungri": 57_009, "Viliuysk": 11_095}

# The dict keys become the index labels of the Series.
pop_s = pd.Series(data=pop_dict)
pop_s

Yakutsk      318768
Neryungri     57009
Viliuysk      11095
dtype: int64

In [23]:
# Show the population of the selected city (Yakutsk)
pop_s["Yakutsk"]

318768

In [22]:
# Slice from Neryungri to Viliuysk
# With explicit (label) indexes the end label is included in the slice
pop_s["Neryungri":"Viliuysk"]

Neryungri    57009
Viliuysk     11095
dtype: int64

### Объект класса (типа) `DataFrame`

In [24]:
# Load the presidents' heights table from a CSV file; the path is relative
# to the current working directory.
# NOTE(review): the " (1)" suffix looks like a duplicate-download file name -
# confirm it matches the actual file in data/.
data = pd.read_csv(filepath_or_buffer="data/president_heights (1).csv")
data

Unnamed: 0,order,name,height(cm)
0,1,George Washington,189
1,2,John Adams,170
2,3,Thomas Jefferson,189
3,4,James Madison,163
4,5,James Monroe,183
5,6,John Quincy Adams,171
6,7,Andrew Jackson,185
7,8,Martin Van Buren,168
8,9,William Henry Harrison,173
9,10,John Tyler,183


In [25]:
# Nested dict: the outer keys become columns, the inner keys become row labels.
# A row label missing from a column is filled with NaN in that column.
d2 = {
    "col_1": {
        "row_1": 100,
        "row_2": 200,
    },
    "col_2": {
        "row_2": 400,
        "row_3": 500,
    },
}
df1 = pd.DataFrame(data=d2)
df1

Unnamed: 0,col_1,col_2
row_1,100.0,
row_2,200.0,400.0
row_3,,500.0


In [26]:
# Extract the data of one column (the result is a Series)
df1["col_1"]

row_1    100.0
row_2    200.0
row_3      NaN
Name: col_1, dtype: float64

In [28]:
# Data of one row: select it by label, ':' keeps every column
df1.loc["row_2", :]

col_1    200.0
col_2    400.0
Name: row_2, dtype: float64

In [29]:
# A single cell: row label first, then column label.
df1.loc["row_2", "col_2"]

400.0

In [30]:
# First rows of the frame; with no argument head() shows 5 rows.
data.head()

Unnamed: 0,order,name,height(cm)
0,1,George Washington,189
1,2,John Adams,170
2,3,Thomas Jefferson,189
3,4,James Madison,163
4,5,James Monroe,183


In [31]:
# First 7 rows of the frame.
data.head(7)

Unnamed: 0,order,name,height(cm)
0,1,George Washington,189
1,2,John Adams,170
2,3,Thomas Jefferson,189
3,4,James Madison,163
4,5,James Monroe,183
5,6,John Quincy Adams,171
6,7,Andrew Jackson,185


In [32]:
# Row index of the frame.
data.index

RangeIndex(start=0, stop=42, step=1)

In [34]:
# Column labels of the frame.
data.columns

Index(['order', 'name', 'height(cm)'], dtype='object')

In [35]:
# Select a single column by its name.
data["name"]

0          George Washington
1                 John Adams
2           Thomas Jefferson
3              James Madison
4               James Monroe
5          John Quincy Adams
6             Andrew Jackson
7           Martin Van Buren
8     William Henry Harrison
9                 John Tyler
10             James K. Polk
11            Zachary Taylor
12          Millard Fillmore
13           Franklin Pierce
14            James Buchanan
15           Abraham Lincoln
16            Andrew Johnson
17          Ulysses S. Grant
18       Rutherford B. Hayes
19         James A. Garfield
20         Chester A. Arthur
21         Benjamin Harrison
22          William McKinley
23        Theodore Roosevelt
24       William Howard Taft
25            Woodrow Wilson
26         Warren G. Harding
27           Calvin Coolidge
28            Herbert Hoover
29     Franklin D. Roosevelt
30           Harry S. Truman
31      Dwight D. Eisenhower
32           John F. Kennedy
33         Lyndon B. Johnson
34            

In [36]:
# All cell values as a 2-D NumPy array, one inner row per DataFrame row.
data.values

array([[1, 'George Washington', 189],
       [2, 'John Adams', 170],
       [3, 'Thomas Jefferson', 189],
       [4, 'James Madison', 163],
       [5, 'James Monroe', 183],
       [6, 'John Quincy Adams', 171],
       [7, 'Andrew Jackson', 185],
       [8, 'Martin Van Buren', 168],
       [9, 'William Henry Harrison', 173],
       [10, 'John Tyler', 183],
       [11, 'James K. Polk', 173],
       [12, 'Zachary Taylor', 173],
       [13, 'Millard Fillmore', 175],
       [14, 'Franklin Pierce', 178],
       [15, 'James Buchanan', 183],
       [16, 'Abraham Lincoln', 193],
       [17, 'Andrew Johnson', 178],
       [18, 'Ulysses S. Grant', 173],
       [19, 'Rutherford B. Hayes', 174],
       [20, 'James A. Garfield', 183],
       [21, 'Chester A. Arthur', 183],
       [23, 'Benjamin Harrison', 168],
       [25, 'William McKinley', 170],
       [26, 'Theodore Roosevelt', 178],
       [27, 'William Howard Taft', 182],
       [28, 'Woodrow Wilson', 180],
       [29, 'Warren G. Harding', 183

In [37]:
# Values of the "name" column as a NumPy array.
data["name"].values

array(['George Washington', 'John Adams', 'Thomas Jefferson',
       'James Madison', 'James Monroe', 'John Quincy Adams',
       'Andrew Jackson', 'Martin Van Buren', 'William Henry Harrison',
       'John Tyler', 'James K. Polk', 'Zachary Taylor',
       'Millard Fillmore', 'Franklin Pierce', 'James Buchanan',
       'Abraham Lincoln', 'Andrew Johnson', 'Ulysses S. Grant',
       'Rutherford B. Hayes', 'James A. Garfield', 'Chester A. Arthur',
       'Benjamin Harrison', 'William McKinley', 'Theodore Roosevelt',
       'William Howard Taft', 'Woodrow Wilson', 'Warren G. Harding',
       'Calvin Coolidge', 'Herbert Hoover', 'Franklin D. Roosevelt',
       'Harry S. Truman', 'Dwight D. Eisenhower', 'John F. Kennedy',
       'Lyndon B. Johnson', 'Richard Nixon', 'Gerald Ford',
       'Jimmy Carter', 'Ronald Reagan', 'George H. W. Bush',
       'Bill Clinton', 'George W. Bush', 'Barack Obama'], dtype=object)

In [38]:
# Smallest and largest value of the height column, shown as a (min, max) tuple.
data["height(cm)"].min(), data["height(cm)"].max()


(163, 193)

In [39]:
# Boolean masking: compare every height with 185 to get a True/False Series
# (True where the height is greater than 185 cm).
mask = data["height(cm)"].gt(185)
mask

0      True
1     False
2      True
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15     True
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29     True
30    False
31    False
32    False
33     True
34    False
35    False
36    False
37    False
38     True
39     True
40    False
41    False
Name: height(cm), dtype: bool

In [40]:
# Keep only the rows where mask is True; ':' selects every column.
data.loc[mask, :]

Unnamed: 0,order,name,height(cm)
0,1,George Washington,189
2,3,Thomas Jefferson,189
15,16,Abraham Lincoln,193
29,32,Franklin D. Roosevelt,188
33,36,Lyndon B. Johnson,193
38,41,George H. W. Bush,188
39,42,Bill Clinton,188


In [41]:
# Rows where the height is below 170 cm, all columns; the condition is built
# inline instead of being stored in a separate mask variable.
data.loc[data["height(cm)"].lt(170), :]

Unnamed: 0,order,name,height(cm)
3,4,James Madison,163
7,8,Martin Van Buren,168
21,23,Benjamin Harrison,168


In [42]:
# Index labels (city names) of the population Series.
pop_s.index

Index(['Yakutsk', 'Neryungri', 'Viliuysk'], dtype='object')

In [43]:
# Areas of the same cities. The labels are deliberately listed in a different
# order than in the population Series; mixing 98.9 with integers makes the
# whole Series float64.
area_s = pd.Series(data={"Yakutsk": 122, "Viliuysk": 15, "Neryungri": 98.9})
area_s

Yakutsk      122.0
Viliuysk      15.0
Neryungri     98.9
dtype: float64

In [44]:
# Combine the two Series into one DataFrame, one column per Series.
# Rows are matched by city label, not by position, so the differing order
# of the two indexes does not matter.
ykt_df = pd.DataFrame(data={"population": pop_s, "area": area_s})
ykt_df

Unnamed: 0,population,area
Neryungri,57009,98.9
Viliuysk,11095,15.0
Yakutsk,318768,122.0


In [45]:
# Add a derived column: population divided by area, computed element-wise
# per row. The column label is kept in Russian ("population density").
ykt_df["плотность населения"] = ykt_df["population"].div(ykt_df["area"])
ykt_df

Unnamed: 0,population,area,плотность населения
Neryungri,57009,98.9,576.430738
Viliuysk,11095,15.0,739.666667
Yakutsk,318768,122.0,2612.852459
