In [36]:
import pandas as pd
import numpy as np

# Создание DataFrame

In [2]:
# Build the example table column by column; the missing score for Peter
# is given as None.
df = pd.DataFrame({
    'name': ['Anna', 'Sam', 'Bill', 'Monica', 'Lisa', 'Peter'],
    'age': [23, 36, 33, 25, 27, 32],
    'score': [3, 12, 10, 7, 7, None],
})

In [3]:
df

Unnamed: 0,name,age,score
0,Anna,23,3.0
1,Sam,36,12.0
2,Bill,33,10.0
3,Monica,25,7.0
4,Lisa,27,7.0
5,Peter,32,


In [4]:
# Show one column and the whole table, each followed by its type.
print(
    repr(df['name']), '\n', type(df['name']), '\n\n',
    repr(df), '\n', type(df),
)

0      Anna
1       Sam
2      Bill
3    Monica
4      Lisa
5     Peter
Name: name, dtype: object 
 <class 'pandas.core.series.Series'> 

      name  age  score
0    Anna   23    3.0
1     Sam   36   12.0
2    Bill   33   10.0
3  Monica   25    7.0
4    Lisa   27    7.0
5   Peter   32    NaN 
 <class 'pandas.core.frame.DataFrame'>


In [5]:
# Rename the 'score' column to 'expr' by name instead of overwriting every
# label by position, so a change in column order cannot mislabel the data.
df = df.rename(columns={'score': 'expr'})
df

Unnamed: 0,name,age,expr
0,Anna,23,3.0
1,Sam,36,12.0
2,Bill,33,10.0
3,Monica,25,7.0
4,Lisa,27,7.0
5,Peter,32,


# Индексы и метод `.iloc`

Метод `.iloc` работает на основе целочисленного позиционирования.

In [6]:
df.iloc[1, 2]

12.0

In [7]:
# Print several selections, separated from each other by a blank line.
selections = [
    df.iloc[1:3, 1],
    df.iloc[:, 0],
    df.iloc[1:3, :],
    df['name'],
    df[['name', 'expr']],
]
print(*selections, sep='\n\n')

1    36
2    33
Name: age, dtype: int64

0      Anna
1       Sam
2      Bill
3    Monica
4      Lisa
5     Peter
Name: name, dtype: object

   name  age  expr
1   Sam   36  12.0
2  Bill   33  10.0

0      Anna
1       Sam
2      Bill
3    Monica
4      Lisa
5     Peter
Name: name, dtype: object

     name  expr
0    Anna   3.0
1     Sam  12.0
2    Bill  10.0
3  Monica   7.0
4    Lisa   7.0
5   Peter   NaN


# Индексы и метод `.loc`

Метод `.loc` использует именованные индексы (метки строк и столбцов).


In [8]:
# .loc slices by label and includes both endpoints (rows labelled 1 through 4).
df.loc[1:4, ['name', 'expr']]

Unnamed: 0,name,expr
1,Sam,12.0
2,Bill,10.0
3,Monica,7.0
4,Lisa,7.0


In [9]:
df2 = df.copy()

In [10]:
# Use the names as row labels while keeping the 'name' column in the table.
df2 = df2.set_index('name', drop=False)
df2

Unnamed: 0_level_0,name,age,expr
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Anna,Anna,23,3.0
Sam,Sam,36,12.0
Bill,Bill,33,10.0
Monica,Monica,25,7.0
Lisa,Lisa,27,7.0
Peter,Peter,32,


In [11]:
df2.loc['Sam':'Lisa', :]

Unnamed: 0_level_0,name,age,expr
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sam,Sam,36,12.0
Bill,Bill,33,10.0
Monica,Monica,25,7.0
Lisa,Lisa,27,7.0


In [12]:
df2.loc['Sam', 'age']

36

In [13]:

# Keep only the rows whose age is greater than 30.
df.loc[df['age'] > 30]

Unnamed: 0,name,age,expr
1,Sam,36,12.0
2,Bill,33,10.0
5,Peter,32,


# Разница между `.iloc` & `.loc`

* [Using Pandas iloc, loc, & ix to select rows and columns in DataFrames](https://coderoad.ru/31593201/%D0%A7%D0%B5%D0%BC-%D0%BE%D1%82%D0%BB%D0%B8%D1%87%D0%B0%D1%8E%D1%82%D1%81%D1%8F-iloc-%D0%B8-loc)
* [Чем отличаются iloc и loc?](https://coderoad.ru/31593201/%D0%A7%D0%B5%D0%BC-%D0%BE%D1%82%D0%BB%D0%B8%D1%87%D0%B0%D1%8E%D1%82%D1%81%D1%8F-iloc-%D0%B8-loc)


# Характеристики датафрейма pandas


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    6 non-null      object 
 1   age     6 non-null      int64  
 2   expr    5 non-null      float64
dtypes: float64(1), int64(1), object(1)
memory usage: 272.0+ bytes


In [15]:
df.shape

(6, 3)

In [16]:
df.describe()

Unnamed: 0,age,expr
count,6.0,5.0
mean,29.333333,7.8
std,5.085928,3.420526
min,23.0,3.0
25%,25.5,7.0
50%,29.5,7.0
75%,32.75,10.0
max,36.0,12.0


In [17]:
df.columns

Index(['name', 'age', 'expr'], dtype='object')

In [18]:
df.index

RangeIndex(start=0, stop=6, step=1)

In [19]:
df2.index

Index(['Anna', 'Sam', 'Bill', 'Monica', 'Lisa', 'Peter'], dtype='object', name='name')

# Операции над датафреймами.

## Часть 1


In [20]:
df.head(4)

Unnamed: 0,name,age,expr
0,Anna,23,3.0
1,Sam,36,12.0
2,Bill,33,10.0
3,Monica,25,7.0


In [21]:
df.tail(4)

Unnamed: 0,name,age,expr
2,Bill,33,10.0
3,Monica,25,7.0
4,Lisa,27,7.0
5,Peter,32,


In [22]:
# Drop every row that contains a missing value.
# Note: despite its name, df_na holds the rows WITHOUT missing values.
df_na = df.dropna()
df_na

Unnamed: 0,name,age,expr
0,Anna,23,3.0
1,Sam,36,12.0
2,Bill,33,10.0
3,Monica,25,7.0
4,Lisa,27,7.0


In [23]:
df_na[df_na['age'] > 30]

Unnamed: 0,name,age,expr
1,Sam,36,12.0
2,Bill,33,10.0


In [24]:
# Rows where age is above 20 and expr is below 10.
is_match = df_na['age'].gt(20) & df_na['expr'].lt(10)
df_na[is_match]

Unnamed: 0,name,age,expr
0,Anna,23,3.0
3,Monica,25,7.0
4,Lisa,27,7.0


## Часть 2

In [25]:
df['age_sq'] = df['age'] ** 2

In [26]:
df['status'] = 'W'

In [27]:
df['gender'] = [0, 1, 1, 0, 0, 1]

In [28]:
# Difference between age and expr; a missing expr gives a missing result.
df['no_work'] = df['age'] - df['expr']

In [29]:
df

Unnamed: 0,name,age,expr,age_sq,status,gender,no_work
0,Anna,23,3.0,529,W,0,20.0
1,Sam,36,12.0,1296,W,1,24.0
2,Bill,33,10.0,1089,W,1,23.0
3,Monica,25,7.0,625,W,0,18.0
4,Lisa,27,7.0,729,W,0,20.0
5,Peter,32,,1024,W,1,


# Применение функций и метод `.apply()`


In [30]:
# Load the dataset from a CSV file given by a relative path,
# then preview the first rows.
df_WL = pd.read_csv('WeightLoss.csv')
df_WL.head()

Unnamed: 0,id,group,w1,w2,w3,se1,se2,se3
0,1,Control,4,3,3.0,14.0,13.0,15.0
1,2,Control,4,4,3.0,13.0,14.0,17.0
2,3,Control,4,3,1.0,17.0,12.0,16.0
3,4,Control,3,2,1.0,11.0,11.0,12.0
4,5,Control,5,3,2.0,16.0,15.0,14.0


In [47]:
# Row-wise statistics over the weight columns w1..w3, computed with pandas'
# built-in reductions instead of calling a Python function once per row via
# .apply. Missing values are skipped by default, as before.
weights = df_WL.loc[:, 'w1':'w3']
df_WL['total_weight'] = weights.sum(axis=1)
df_WL['avloss'] = weights.mean(axis=1)
# Spread between the largest and the smallest weight in each row.
df_WL['wrange'] = weights.max(axis=1) - weights.min(axis=1)

In [49]:
df_WL.head(7)

Unnamed: 0,id,group,w1,w2,w3,se1,se2,se3,total_weight,avloss,wrange
0,1,Control,4,3,3.0,14.0,13.0,15.0,10.0,3.333333,1.0
1,2,Control,4,4,3.0,13.0,14.0,17.0,11.0,3.666667,1.0
2,3,Control,4,3,1.0,17.0,12.0,16.0,8.0,2.666667,3.0
3,4,Control,3,2,1.0,11.0,11.0,12.0,6.0,2.0,2.0
4,5,Control,5,3,2.0,16.0,15.0,14.0,10.0,3.333333,3.0
5,6,Control,6,5,4.0,17.0,18.0,18.0,15.0,5.0,2.0
6,7,Control,6,5,4.0,17.0,16.0,19.0,15.0,5.0,2.0


# Группировка и агрегирование 

In [56]:
list(df_WL.groupby('group'))

[('Control',
      id    group  w1  w2   w3   se1   se2   se3  total_weight    avloss  wrange
  0    1  Control   4   3  3.0  14.0  13.0  15.0          10.0  3.333333     1.0
  1    2  Control   4   4  3.0  13.0  14.0  17.0          11.0  3.666667     1.0
  2    3  Control   4   3  1.0  17.0  12.0  16.0           8.0  2.666667     3.0
  3    4  Control   3   2  1.0  11.0  11.0  12.0           6.0  2.000000     2.0
  4    5  Control   5   3  2.0  16.0  15.0  14.0          10.0  3.333333     3.0
  5    6  Control   6   5  4.0  17.0  18.0  18.0          15.0  5.000000     2.0
  6    7  Control   6   5  4.0  17.0  16.0  19.0          15.0  5.000000     2.0
  7    8  Control   5   4  1.0   NaN   NaN   NaN          10.0  3.333333     4.0
  8    9  Control   5   4  1.0  14.0  14.0  15.0          10.0  3.333333     4.0
  9   10  Control   3   3  2.0  14.0  15.0  13.0           8.0  2.666667     1.0
  10  11  Control   4   2  2.0  16.0  16.0  11.0           8.0  2.666667     2.0
  11  12  Contr

In [57]:
# Mean of every remaining column within each group.
df_WL.groupby('group').mean()

Unnamed: 0_level_0,id,w1,w2,w3,se1,se2,se3,total_weight,avloss,wrange
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Control,6.5,4.5,3.333333,2.083333,14.909091,14.272727,15.090909,9.916667,3.305556,2.416667
Diet,18.5,5.333333,3.916667,2.25,14.833333,13.75,16.166667,11.5,3.833333,3.083333
DietEx,29.5,6.2,6.1,2.333333,15.2,13.3,17.666667,14.4,4.983333,4.5


In [66]:
# Min, max and mean of each weight column within every group.
weight_cols = ['w1', 'w2', 'w3']
df_WL.groupby('group')[weight_cols].agg(['min', 'max', 'mean'])

Unnamed: 0_level_0,w1,w1,w1,w2,w2,w2,w3,w3,w3
Unnamed: 0_level_1,min,max,mean,min,max,mean,min,max,mean
group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Control,3,6,4.5,2,5,3.333333,1.0,4.0,2.083333
Diet,3,7,5.333333,2,6,3.916667,1.0,4.0,2.25
DietEx,3,9,6.2,4,9,6.1,1.0,4.0,2.333333


# Сортировка и упорядочение

In [67]:
df_WL.sort_values('total_weight')

Unnamed: 0,id,group,w1,w2,w3,se1,se2,se3,total_weight,avloss,wrange
16,17,Diet,3,2,1.0,16.0,17.0,15.0,6.0,2.0,2.0
3,4,Control,3,2,1.0,11.0,11.0,12.0,6.0,2.0,2.0
19,20,Diet,4,2,1.0,12.0,11.0,11.0,7.0,2.333333,3.0
2,3,Control,4,3,1.0,17.0,12.0,16.0,8.0,2.666667,3.0
27,28,DietEx,3,4,1.0,16.0,13.0,,8.0,2.666667,3.0
18,19,Diet,4,3,1.0,12.0,11.0,14.0,8.0,2.666667,3.0
9,10,Control,3,3,2.0,14.0,15.0,13.0,8.0,2.666667,1.0
10,11,Control,4,2,2.0,16.0,16.0,11.0,8.0,2.666667,2.0
11,12,Control,5,2,1.0,15.0,13.0,16.0,8.0,2.666667,4.0
28,29,DietEx,3,5,1.0,13.0,13.0,16.0,9.0,3.0,4.0


In [71]:
# Sort by wrange and total_weight descending, then by se3 ascending.
df_WL.sort_values(
    by=['wrange', 'total_weight', 'se3'],
    ascending=[False, False, True],
)

Unnamed: 0,id,group,w1,w2,w3,se1,se2,se3,total_weight,avloss,wrange
31,32,DietEx,9,5,2.0,16.0,14.0,17.0,16.0,5.333333,7.0
33,34,DietEx,8,6,1.0,17.0,17.0,17.0,15.0,5.0,7.0
26,27,DietEx,9,7,3.0,13.0,12.0,17.0,19.0,6.333333,6.0
32,33,DietEx,7,9,4.0,16.0,16.0,19.0,20.0,6.666667,5.0
14,15,Diet,7,6,3.0,17.0,11.0,18.0,16.0,5.333333,4.0
23,24,Diet,7,4,3.0,16.0,14.0,18.0,14.0,4.666667,4.0
29,30,DietEx,6,5,2.0,15.0,12.0,18.0,13.0,4.333333,4.0
15,16,Diet,6,4,2.0,16.0,15.0,18.0,12.0,4.0,4.0
12,13,Diet,6,3,2.0,12.0,11.0,14.0,11.0,3.666667,4.0
8,9,Control,5,4,1.0,14.0,14.0,15.0,10.0,3.333333,4.0


# Работа с NaN-ами

In [77]:
df_WL.isnull().sum()

id              0
group           0
w1              0
w2              0
w3              1
se1             1
se2             1
se3             2
total_weight    0
avloss          0
wrange          0
dtype: int64

In [79]:
# Replace missing se3 values with 0 by assigning the result back to the column.
# Calling fillna(inplace=True) on `df_WL.se3` is a chained in-place update:
# recent pandas warns about it, and with Copy-on-Write it leaves df_WL unchanged.
df_WL['se3'] = df_WL['se3'].fillna(0)
# Count the missing values that remain in each column.
df_WL.isnull().sum()

id              0
group           0
w1              0
w2              0
w3              1
se1             1
se2             1
se3             0
total_weight    0
avloss          0
wrange          0
dtype: int64

In [82]:
# Keep only the rows without missing values, then confirm none remain.
df_WL = df_WL.dropna()
df_WL.isna().sum()

id              0
group           0
w1              0
w2              0
w3              0
se1             0
se2             0
se3             0
total_weight    0
avloss          0
wrange          0
dtype: int64

# Иерархическое индексирование.

## Часть 1