# pandas practice

In [12]:
import pandas as pd
import re

In [13]:
movies = pd.read_csv('data/movies.csv', index_col=False)
movies.head()

Unnamed: 0,userId,movieId,rating,date,title,genres
0,1,1,4.0,2000-07-30 18:45:03,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,2000-07-30 18:20:47,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,2000-07-30 18:37:04,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,2000-07-30 19:03:35,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,2000-07-30 18:48:51,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


### task 0 
* append new column 'year_release' that corresponds to the year the movie was released.
* how many movies don't have a release date?

In [14]:
def get_year_release(arg):
    candidates = re.findall(r'\(\d{4}\)', arg) 
    if len(candidates) > 0:
        year = candidates[0].replace('(', '')
        year = year.replace(')', '')
        return int(year)
    else:
        return None

In [15]:
movies['year_release'] = movies['title'].apply(get_year_release)
movies['year_release'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 100836 entries, 0 to 100835
Series name: year_release
Non-Null Count   Dtype  
--------------   -----  
100818 non-null  float64
dtypes: float64(1)
memory usage: 787.9 KB


# task 1
* which movie 1999 released, is the lowest average rating?

In [16]:
movies[movies['year_release'] == 1999].groupby(by='title')['rating'].mean().sort_values(ascending=True)

title
Bloodsport: The Dark Kumite (1999)                  0.5
Trippin' (1999)                                     1.0
Chill Factor (1999)                                 1.0
From Dusk Till Dawn 2: Texas Blood Money (1999)     1.0
Simon Sez (1999)                                    1.0
                                                   ... 
Mickey's Once Upon a Christmas (1999)               5.0
On the Ropes (1999)                                 5.0
Trailer Park Boys (1999)                            5.0
Five Senses, The (1999)                             5.0
George Carlin: You Are All Diseased (1999)          5.0
Name: rating, Length: 261, dtype: float64

# task 2 
* which ganre of movies released in 2010 is the lowest average rating?

In [17]:
movies[movies['year_release'] == 2010].groupby(by='genres')['rating'].mean().sort_values(ascending=True)

genres
Action|Sci-Fi                        1.000000
Action|Adventure|Horror              1.500000
Action|Drama|Fantasy                 1.500000
Crime|Romance                        1.500000
Adventure|Comedy|Fantasy             1.833333
                                       ...   
Crime                                4.750000
Adventure|Children|Comedy|Mystery    5.000000
Animation|Children|Mystery           5.000000
Animation|Drama|Fantasy|Mystery      5.000000
Comedy|Musical                       5.000000
Name: rating, Length: 119, dtype: float64

# task 3
* which user has watched the most different combinations of genres of films?

In [18]:
movies.groupby(by='userId')['genres'].nunique().sort_values(ascending=False)

userId
599    524
414    482
448    403
380    399
474    395
      ... 
494     15
578     15
214     13
85      13
245     13
Name: genres, Length: 610, dtype: int64

# task 4
* which user gave the least number of ratings, and his average rating for movie is the highest?

In [19]:
movies.groupby(by='userId')['rating'].agg(['count', 'mean']).sort_values(by=['count', 'mean'], ascending=[True, False])

Unnamed: 0_level_0,count,mean
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
53,20,5.000000
595,20,4.200000
189,20,4.100000
569,20,4.000000
278,20,3.875000
...,...,...
274,1346,3.235884
448,1864,2.847371
474,2108,3.398956
599,2478,2.642050


# task 5
* which genres have the highest average rating for 2018, and the number of ratings exceeds 10?

In [20]:
av_genres = movies[movies['year_release'] == 2018].groupby(by='genres')['rating'].agg(['mean', 'count'])
av_genres[av_genres['count'] > 10].groupby(by='genres')['mean'].value_counts(ascending=True)

genres                   mean    
Action|Adventure|Sci-Fi  3.928571    1
Action|Comedy|Sci-Fi     3.875000    1
Name: count, dtype: int64

# task 6

In [21]:
movies['date'] = pd.to_datetime(movies['date'])
movies['year_rating'] = movies['date'].dt.year

In [22]:
mov = movies.pivot_table(
    values='rating',
    index='genres',
    columns='year_rating',
    aggfunc='mean'
)
mov.describe()
mov[1999].sort_values(ascending=False).head(15)
# i love to jerk off

genres
Action|Adventure|Mystery|Romance|Thriller     5.0
Animation|Children|Fantasy|Musical|Romance    5.0
Adventure|Fantasy|Romance|Sci-Fi|Thriller     5.0
Comedy|Drama|Film-Noir                        5.0
Adventure|Animation|Comedy                    5.0
Comedy|Crime|Mystery                          5.0
Adventure|Animation|Children|Comedy|Sci-Fi    5.0
Adventure|Comedy|Crime|Romance                5.0
Action|Horror                                 5.0
Comedy|Horror|Mystery|Thriller                5.0
Action|Mystery|Sci-Fi                         5.0
Crime|Mystery|Romance|Thriller                5.0
Crime|Film-Noir|Thriller                      5.0
Action|Crime|Romance|Thriller                 5.0
Crime|Film-Noir                               5.0
Name: 1999, dtype: float64

In [26]:
orders = pd.read_csv('data/orders.csv', sep=';', index_col=False)
products = pd.read_csv('data/products.csv', sep=';', index_col=False)
display(orders.head(3))
display(products.head(3))

Unnamed: 0,Дата создания,Order ID,ID Покупателя,Статус,Оплачен,Отменен,Отгружен,ID товара,Количество
0,09.11.2019 21:55:51,9,10,"Принят, ожидается оплата",Нет,Нет,Нет,103,5
1,09.11.2019 15:05:57,8,9,"Принят, ожидается оплата",Нет,Нет,Нет,86,100
2,09.11.2019 15:05:57,8,9,"Принят, ожидается оплата",Нет,Нет,Нет,104,10


Unnamed: 0,Product_ID,Name,Price,CURRENCY
0,47,Шатны Полосатый рейс,2999,RUR
1,51,Платье Аленький цветочек,4999,RUR
2,53,Штаны Цветочная Поляна,4999,RUR


### task 7
* What is the identifier of the order for which there was no information about the product?

In [57]:
orders_products = orders.merge(
    right=products,
    right_on='Product_ID',
    left_on='ID товара',
    how='left'
)
orders_products['Name']

0         Носки Подарочные, муж
1            Носки Простые, муж
2         Носки Подарочные, жен
3         Носки Подарочные, жен
4         Носки Подарочные, жен
5         Носки Подарочные, муж
6          Носки беговые Camino
7            Носки Честные, муж
8         Носки Подарочные, муж
9         Носки Подарочные, жен
10     Платье Ночная Жизнь XXXL
11       Штаны Цветочная Поляна
12    Гольфы детские Снегурочка
13         Шатны Полосатый рейс
14          Платье Ночная Жизнь
15           Носки Простые, муж
16     Платье Аленький цветочек
17                          NaN
Name: Name, dtype: object

### task 8
* Which customer generated the most total profit for the online store during the specified period?

In [73]:
orders_products['profit'] = orders_products['Количество'] * orders_products['Price']
mask = orders_products['Оплачен'] == 'Да'
orders_products[mask].groupby('ID Покупателя')['profit'].sum().sort_values(ascending=False)

ID Покупателя
7    17096.0
5    13043.0
8     1344.0
1        0.0
Name: profit, dtype: float64