### Series

In [20]:
import pandas as pd
import numpy as np
from datetime import date

In [48]:
a = np.arange(0,30,2)
np.random.shuffle(a)
a

array([10, 20, 18, 24,  2,  6, 14,  4,  8,  0, 22, 16, 26, 28, 12])

In [14]:
b = pd.Series(a)
b

0    8
1    4
2    0
3    6
4    2
dtype: int32

In [15]:
b.index

RangeIndex(start=0, stop=5, step=1)

In [26]:
ind = [date(y,m,d) for y,m,d in [(2018, 1, 15), (2018, 4,19), (2018,3,24),(2018,7,8),(2018,10,10)]]
b = pd.Series(a, index=ind)
b

2018-01-15    8
2018-04-19    4
2018-03-24    0
2018-07-08    6
2018-10-10    2
dtype: int32

In [28]:
b.index

Index([2018-01-15, 2018-04-19, 2018-03-24, 2018-07-08, 2018-10-10], dtype='object')

In [30]:
b.index = pd.to_datetime(b.index, format = '%Y-%m-%d')

In [31]:
b.index

DatetimeIndex(['2018-01-15', '2018-04-19', '2018-03-24', '2018-07-08',
               '2018-10-10'],
              dtype='datetime64[ns]', freq=None)

In [32]:
b = pd.Series(a, dtype = np.float64)

In [33]:
b

0    8.0
1    4.0
2    0.0
3    6.0
4    2.0
dtype: float64

In [34]:
b = pd.Series(a)
b = b.astype(np.float64)
b

0    8.0
1    4.0
2    0.0
3    6.0
4    2.0
dtype: float64

In [35]:
b.values

array([8., 4., 0., 6., 2.])

In [36]:
b[[0,2]]

0    8.0
2    0.0
dtype: float64

In [39]:
b.head(2)

0    8.0
1    4.0
dtype: float64

In [40]:
b.tail(3)

2    0.0
3    6.0
4    2.0
dtype: float64

In [41]:
b[b > 5]

0    8.0
3    6.0
dtype: float64

In [43]:
b[(b == 2) | (b > 7)]

0    8.0
4    2.0
dtype: float64

In [45]:
b[(b > 5) & (b < 9)]

0    8.0
3    6.0
dtype: float64

In [46]:
b[b < 5] = 0

In [51]:
b = pd.Series(a)
b

0     10
1     20
2     18
3     24
4      2
5      6
6     14
7      4
8      8
9      0
10    22
11    16
12    26
13    28
14    12
dtype: int32

In [52]:
b[[0,3,7]] = 1

In [53]:
b

0      1
1     20
2     18
3      1
4      2
5      6
6     14
7      1
8      8
9      0
10    22
11    16
12    26
13    28
14    12
dtype: int32

In [54]:
b = b.append(pd.Series(a[0:5]))

In [56]:
b.head(2)

0     1
1    20
dtype: int32

In [57]:
b.tail(3)

2    18
3     1
4     2
dtype: int32

In [58]:
b.count()

20

### DataFrame

In [59]:
df = pd.DataFrame({'Col1':['a','b','c','d','e','f','g','h'],
                   'Col2':[1,2,3,4,5,6,7,8]}, columns=['Col1','Col2'])
df

Unnamed: 0,Col1,Col2
0,a,1
1,b,2
2,c,3
3,d,4
4,e,5
5,f,6
6,g,7
7,h,8


In [60]:
df.shape

(8, 2)

In [61]:
df.columns

Index(['Col1', 'Col2'], dtype='object')

In [62]:
df.index

RangeIndex(start=0, stop=8, step=1)

In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 2 columns):
Col1    8 non-null object
Col2    8 non-null int64
dtypes: int64(1), object(1)
memory usage: 208.0+ bytes


In [64]:
df.describe()

Unnamed: 0,Col2
count,8.0
mean,4.5
std,2.44949
min,1.0
25%,2.75
50%,4.5
75%,6.25
max,8.0


In [65]:
df.head()

Unnamed: 0,Col1,Col2
0,a,1
1,b,2
2,c,3
3,d,4
4,e,5


In [66]:
df.head(2)

Unnamed: 0,Col1,Col2
0,a,1
1,b,2


In [67]:
df['Col1'].head(2)

0    a
1    b
Name: Col1, dtype: object

In [68]:
df.Col1.tail(2)

6    g
7    h
Name: Col1, dtype: object

In [79]:
df.index = [np.arange(2,17,2)]

In [80]:
df.loc[2, 'Col1']

2    a
Name: Col1, dtype: object

In [81]:
df.loc[2]

Unnamed: 0,Col1,Col2
2,a,1


In [87]:
df.loc[0:2]

Unnamed: 0,Col1,Col2
2,a,1


In [88]:
df.iloc[0]

Col1    a
Col2    1
Name: (2,), dtype: object

In [89]:
df.iloc[0:2,]

Unnamed: 0,Col1,Col2
2,a,1
4,b,2


In [91]:
df.loc[df['Col1'] == 'b']

Unnamed: 0,Col1,Col2
4,b,2


In [93]:
df.loc[df['Col1'] == 'b'].values

array([['b', 2]], dtype=object)

In [104]:
df.loc[df.Col2.between(3,6)]

Unnamed: 0,Col1,Col2
6,c,3
8,d,4
10,e,5
12,f,6


In [105]:
df.loc[~df.Col2.isin(np.arange(0,5))]

Unnamed: 0,Col1,Col2
10,e,5
12,f,6
14,g,7
16,h,8


In [107]:
df.query('Col1 == "b"')

Unnamed: 0,Col1,Col2
4,b,2


In [109]:
df.query('Col2 > 5')

Unnamed: 0,Col1,Col2
12,f,6
14,g,7
16,h,8


In [111]:
s = df['Col1']
s

2     a
4     b
6     c
8     d
10    e
12    f
14    g
16    h
Name: Col1, dtype: object

In [112]:
type(s)

pandas.core.series.Series

In [113]:
df.sample(n=3)

Unnamed: 0,Col1,Col2
12,f,6
8,d,4
16,h,8


In [114]:
df.sample(frac=0.3)

Unnamed: 0,Col1,Col2
8,d,4
4,b,2


In [152]:
df.sample(frac=0.7,replace=True)

Unnamed: 0,Col1,Col2
2,a,1
4,b,2
8,d,4
8,d,4
2,a,1
16,h,8


In [160]:
df.sample(frac = 1.0, random_state=42)

Unnamed: 0,Col1,Col2
4,b,2
12,f,6
2,a,1
16,h,8
6,c,3
10,e,5
8,d,4
14,g,7


In [171]:
df.to_csv('Test.csv', sep = ';')

In [179]:
df_new = pd.read_csv('Test.csv', sep = ';', index_col=0)

In [180]:
df_new

Unnamed: 0,Col1,Col2
2,a,1
4,b,2
6,c,3
8,d,4
10,e,5
12,f,6
14,g,7
16,h,8


### Работа с DF

In [181]:
authors = pd.DataFrame({'author_id': [1,2,3],
                        'author_name':['Pushkin', 'Tolstoy', 'Dostoevski']},
                       columns=['author_id','author_name'])
authors

Unnamed: 0,author_id,author_name
0,1,Pushkin
1,2,Tolstoy
2,3,Dostoevski


In [182]:
books = pd.DataFrame({'author_id': [2,3,3,4],
                      'book_title':['War and Peace', 'The Idiot', 'Crime and Punishment', 'Fathers and Sons']})

In [183]:
books

Unnamed: 0,author_id,book_title
0,2,War and Peace
1,3,The Idiot
2,3,Crime and Punishment
3,4,Fathers and Sons


In [192]:
df2 = pd.merge(authors,books,on='author_id', how='left')
df2

Unnamed: 0,author_id,author_name,book_title
0,1,Pushkin,
1,2,Tolstoy,War and Peace
2,3,Dostoevski,The Idiot
3,3,Dostoevski,Crime and Punishment


In [193]:
df3 = pd.merge(authors,books,on='author_id', how='right')
df3

Unnamed: 0,author_id,author_name,book_title
0,2,Tolstoy,War and Peace
1,3,Dostoevski,The Idiot
2,3,Dostoevski,Crime and Punishment
3,4,,Fathers and Sons


In [194]:
df1 = pd.merge(authors,books,on='author_id', how='inner')
df1

Unnamed: 0,author_id,author_name,book_title
0,2,Tolstoy,War and Peace
1,3,Dostoevski,The Idiot
2,3,Dostoevski,Crime and Punishment


In [210]:
df4 = pd.merge(authors,books,on='author_id', how='outer')

In [211]:
df4.loc[df4.book_title.isnull()]

Unnamed: 0,author_id,author_name,book_title
0,1,Pushkin,


In [212]:
df4.loc[df4.author_name.notnull()]

Unnamed: 0,author_id,author_name,book_title
0,1,Pushkin,
1,2,Tolstoy,War and Peace
2,3,Dostoevski,The Idiot
3,3,Dostoevski,Crime and Punishment


In [213]:
df4.book_title=df4.book_title.fillna('unknown');
df4

Unnamed: 0,author_id,author_name,book_title
0,1,Pushkin,unknown
1,2,Tolstoy,War and Peace
2,3,Dostoevski,The Idiot
3,3,Dostoevski,Crime and Punishment
4,4,,Fathers and Sons


In [214]:
df4.author_name = df4.author_name.fillna('unknown')
df4

Unnamed: 0,author_id,author_name,book_title
0,1,Pushkin,unknown
1,2,Tolstoy,War and Peace
2,3,Dostoevski,The Idiot
3,3,Dostoevski,Crime and Punishment
4,4,unknown,Fathers and Sons


In [215]:
df4.loc[(df4.author_name != 'unknown') & (df4.book_title != 'unknown'), 'quantity'] = 1
df4

Unnamed: 0,author_id,author_name,book_title,quantity
0,1,Pushkin,unknown,
1,2,Tolstoy,War and Peace,1.0
2,3,Dostoevski,The Idiot,1.0
3,3,Dostoevski,Crime and Punishment,1.0
4,4,unknown,Fathers and Sons,


In [216]:
df4.quantity.fillna(0, inplace = True)

In [217]:
df4

Unnamed: 0,author_id,author_name,book_title,quantity
0,1,Pushkin,unknown,0.0
1,2,Tolstoy,War and Peace,1.0
2,3,Dostoevski,The Idiot,1.0
3,3,Dostoevski,Crime and Punishment,1.0
4,4,unknown,Fathers and Sons,0.0


In [218]:
df4.quantity = df4.quantity.astype(int)
df4

Unnamed: 0,author_id,author_name,book_title,quantity
0,1,Pushkin,unknown,0
1,2,Tolstoy,War and Peace,1
2,3,Dostoevski,The Idiot,1
3,3,Dostoevski,Crime and Punishment,1
4,4,unknown,Fathers and Sons,0


In [219]:
df4.set_index('author_id', inplace = True)

In [220]:
df4

Unnamed: 0_level_0,author_name,book_title,quantity
author_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Pushkin,unknown,0
2,Tolstoy,War and Peace,1
3,Dostoevski,The Idiot,1
3,Dostoevski,Crime and Punishment,1
4,unknown,Fathers and Sons,0


In [222]:
df4.reset_index(inplace = True)

In [223]:
df4

Unnamed: 0,author_id,author_name,book_title,quantity
0,1,Pushkin,unknown,0
1,2,Tolstoy,War and Peace,1
2,3,Dostoevski,The Idiot,1
3,3,Dostoevski,Crime and Punishment,1
4,4,unknown,Fathers and Sons,0


In [225]:
df['price'] = 500
df

Unnamed: 0,Col1,Col2,price
2,a,1,500
4,b,2,500
6,c,3,500
8,d,4,500
10,e,5,500
12,f,6,500
14,g,7,500
16,h,8,500


In [226]:
df.drop('price', axis=1)

Unnamed: 0,Col1,Col2
2,a,1
4,b,2
6,c,3
8,d,4
10,e,5
12,f,6
14,g,7
16,h,8


In [227]:
df['price'] = 500
df

Unnamed: 0,Col1,Col2,price
2,a,1,500
4,b,2,500
6,c,3,500
8,d,4,500
10,e,5,500
12,f,6,500
14,g,7,500
16,h,8,500


In [229]:
df.drop(2, axis=0)

Unnamed: 0,Col1,Col2,price
4,b,2,500
6,c,3,500
8,d,4,500
10,e,5,500
12,f,6,500
14,g,7,500
16,h,8,500


In [230]:
df4.sort_values(by='author_id')

Unnamed: 0,author_id,author_name,book_title,quantity
0,1,Pushkin,unknown,0
1,2,Tolstoy,War and Peace,1
2,3,Dostoevski,The Idiot,1
3,3,Dostoevski,Crime and Punishment,1
4,4,unknown,Fathers and Sons,0


In [231]:
df4 = df4.reset_index(drop=True)

In [232]:
df4

Unnamed: 0,author_id,author_name,book_title,quantity
0,1,Pushkin,unknown,0
1,2,Tolstoy,War and Peace,1
2,3,Dostoevski,The Idiot,1
3,3,Dostoevski,Crime and Punishment,1
4,4,unknown,Fathers and Sons,0


In [249]:
hh = np.linspace(200,800,5)
hh

array([200., 350., 500., 650., 800.])

In [251]:
np.random.shuffle(hh)
hh

array([200., 350., 650., 800., 500.])

In [252]:
df4['price'] = hh
df4

Unnamed: 0,author_id,author_name,book_title,quantity,price
0,1,Pushkin,unknown,0,200.0
1,2,Tolstoy,War and Peace,1,350.0
2,3,Dostoevski,The Idiot,1,650.0
3,3,Dostoevski,Crime and Punishment,1,800.0
4,4,unknown,Fathers and Sons,0,500.0


In [254]:
df4['total'] = df4.quantity * df4.price
df4

Unnamed: 0,author_id,author_name,book_title,quantity,price,total
0,1,Pushkin,unknown,0,200.0,0
1,2,Tolstoy,War and Peace,1,350.0,500
2,3,Dostoevski,The Idiot,1,650.0,500
3,3,Dostoevski,Crime and Punishment,1,800.0,500
4,4,unknown,Fathers and Sons,0,500.0,0


In [256]:
df4['price'].max()

800.0

In [257]:
df4['price'].min()

200.0

In [258]:
df4['price'].mean()

500.0

In [259]:
df4['price'].median()

500.0

In [260]:
df4['price'].std()

237.17082451262846

In [261]:
df4['price'].var()

56250.0

In [262]:
df4.nlargest(2, 'price')

Unnamed: 0,author_id,author_name,book_title,quantity,price,total
3,3,Dostoevski,Crime and Punishment,1,800.0,500
2,3,Dostoevski,The Idiot,1,650.0,500


In [263]:
df4.author_name.unique()

array(['Pushkin', 'Tolstoy', 'Dostoevski', 'unknown'], dtype=object)

In [264]:
df4.author_name.nunique()

4

In [265]:
df4.author_name.value_counts()

Dostoevski    2
unknown       1
Pushkin       1
Tolstoy       1
Name: author_name, dtype: int64

In [268]:
df4.groupby('author_name')['price'].max()

author_name
Dostoevski    800.0
Pushkin       200.0
Tolstoy       350.0
unknown       500.0
Name: price, dtype: float64

In [271]:
price_agg = df4.groupby('author_name').agg({'price': 'max'})
price_agg

Unnamed: 0_level_0,price
author_name,Unnamed: 1_level_1
Dostoevski,800.0
Pushkin,200.0
Tolstoy,350.0
unknown,500.0


In [272]:
price_agg = price_agg.rename(columns={'price':'max_price'})
price_agg

Unnamed: 0_level_0,max_price
author_name,Unnamed: 1_level_1
Dostoevski,800.0
Pushkin,200.0
Tolstoy,350.0
unknown,500.0
