## Import Pandas

In [1]:
import pandas as pd

In [2]:
import numpy as np

## Подготовка

In [3]:
from datetime import datetime

In [4]:
import matplotlib.pyplot as plt

In [5]:
from pandas.plotting import register_matplotlib_converters

# конвертеры, которые позволяют использовать типы pandas в matplotlib 

In [None]:
import seaborn as sns

In [None]:
import plotly.express as px

In [None]:
from plotly import graph_objects as go

In [None]:
import math as mth

In [None]:
import scipy.stats

In [None]:
from scipy import stats as st

In [None]:
import sys

In [None]:
import getopt

In [None]:
from sqlalchemy import create_engine

In [None]:
import os

### Опции

In [None]:
# Optional: show at most 5 rows per table so head() is not needed after every frame.
# pd.set_option('display.max_rows', 5)

# Optional: uncomment to show the full, untruncated content of every column.
# pd.set_option('display.max_colwidth', None)

In [6]:
# Render floats with ',' as the thousands separator and two decimal places.
pd.options.display.float_format = '{:,.2f}'.format

#### опции для графики

In [7]:
# Font-size tiers referenced by the rcParams below.
large, med, small = 16, 12, 10

# Global matplotlib defaults for every figure in the notebook.
# 'axes.titlesize' used to be listed twice (large, then med); Python keeps only
# the last duplicate key in a dict literal, so the effective value was `med`.
# The dead first entry is removed and the effective value is kept.
params = {
    'legend.fontsize': med,
    'figure.figsize': (12, 8),
    'axes.labelsize': med,
    'axes.titlesize': med,
    'xtick.labelsize': med,
    'ytick.labelsize': med,
    'figure.titlesize': large,
}
plt.rcParams.update(params)

# The bundled seaborn style sheets were renamed to 'seaborn-v0_8-*' in
# matplotlib 3.6 and the old names were removed in 3.8. Prefer the new name when
# it is available; otherwise fall back to the original name, which fails exactly
# as before if it is missing too.
whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'seaborn-whitegrid',
)
plt.style.use(whitegrid_style)

In [None]:
# Seaborn look: white background with grid lines and the default 'deep' colour cycle.
sns.set_style(style="whitegrid")
sns.set_palette(palette='deep')

### Заливка данных

#### чтение csv-файла

In [45]:
# Disabled alternative: read the dataset from a local 'data.csv' file
# (presumably a local copy of the same data that is loaded by URL below).
# df = pd.read_csv('data.csv', encoding='unicode_escape')

In [50]:
# Convert the Google Drive "share" link into a direct-download link for read_csv.
share_link = 'https://drive.google.com/file/d/1_VOK7a2pUxw4wfwPp9HDWeqNDM2vJifD/view?usp=sharing'
file_id = share_link.split('/')[-2]  # path segment right before 'view?usp=sharing'
url = 'https://drive.google.com/uc?id=' + file_id

In [51]:
# Download the CSV from the direct-download link and load it into `df`.
# NOTE(review): the 'unicode_escape' codec also interprets backslash escape
# sequences found in the data - confirm this is intended rather than a plain
# single-byte encoding such as 'latin-1'.
df = pd.read_csv(url, encoding='unicode_escape')

In [52]:
# Display the loaded frame (the notebook renders a truncated head/tail preview).
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.00,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.00,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.00,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.00,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.00,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,12/9/2011 12:50,0.85,12680.00,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,12/9/2011 12:50,2.10,12680.00,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,12/9/2011 12:50,4.15,12680.00,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,12/9/2011 12:50,4.15,12680.00,France


#### ознакомление с данными

In [12]:
# Column names, non-null counts, dtypes and memory usage of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [13]:
# Count missing values in each column.
df.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [14]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
df.duplicated().sum()

5268

### Предобработка

#### переименование столбцов

In [None]:
# Normalise column names: lower-case them and replace spaces with underscores.
# NOTE(review): later cells in this notebook index the original CamelCase names
# ('InvoiceDate', 'CustomerID', 'UnitPrice'); running this cell first makes those
# lookups raise KeyError - either skip this cell or update the later cells.
df = df.rename(columns=lambda x: x.lower().replace(' ', '_'))

#### удаление столбца

In [None]:
# Template: drop a column by name ('name_collumn' is a placeholder - replace it
# with a real column, otherwise drop() raises KeyError).
# The axis is passed through the `columns=` keyword: the positional form
# `df.drop(label, 1)` was deprecated in pandas 1.3 and raises TypeError since 2.0.
df = df.drop(columns='name_collumn')

#### удаление дубликатов

In [None]:
# Remove exact duplicate rows, then renumber the index from 0 without keeping the old one.
deduplicated = df.drop_duplicates()
df = deduplicated.reset_index(drop=True)

#### преобразование строк в дату и время

In [16]:
# Parse the invoice timestamps. The raw strings look like '12/1/2010 8:26'
# (month/day/year hour:minute), so the format is stated explicitly: this keeps
# every row month-first instead of relying on per-value inference, and avoids
# the slow generic parser on a column of this size.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format='%m/%d/%Y %H:%M')

In [17]:
# Re-check dtypes: InvoiceDate should now be datetime64[ns] instead of object.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [21]:
# Add the calendar month number (1-12) of each invoice as a new 'month' column.
# NOTE(review): the previewed data runs from 12/1/2010 to 12/9/2011, so month 12
# appears to mix December 2010 with December 2011 - confirm that is acceptable.
df['month'] = df['InvoiceDate'].dt.month

In [29]:
# Show the first three rows that belong to customer 12680.
is_customer = df['CustomerID'].eq(12680.00)
df.loc[is_customer].head(3)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,month
305779,563712,21981,PACK OF 12 WOODLAND TISSUES,24,2011-08-18 15:44:00,0.29,12680.0,France,8
305780,563712,21986,PACK OF 12 PINK POLKADOT TISSUES,24,2011-08-18 15:44:00,0.29,12680.0,France,8
305781,563712,22037,ROBOT BIRTHDAY CARD,12,2011-08-18 15:44:00,0.42,12680.0,France,8


In [32]:
# Sum of UnitPrice per customer, largest totals first.
(
    df.groupby('CustomerID')['UnitPrice']
    .sum()
    .sort_values(ascending=False)
)

CustomerID
14,096.00   41,376.33
15,098.00   40,278.90
14,911.00   31,060.66
12,744.00   25,108.89
16,029.00   24,111.14
               ...   
17,752.00        0.42
13,366.00        0.39
12,875.00        0.34
15,118.00        0.17
13,256.00        0.00
Name: UnitPrice, Length: 4372, dtype: float64

In [34]:
# Sum of the UnitPrice column over all rows.
df['UnitPrice'].sum()

2498803.974

In [43]:
# Aggregation spec for .agg(): maps a column name to the list of statistics to compute for it.
unit_price_stats = ['count', 'sum', 'mean', 'median', 'min', 'max', 'std', 'var']
agg_func_math = {'UnitPrice': unit_price_stats}

In [44]:
# Per-customer UnitPrice statistics, rounded to 2 decimals.
# dropna=False keeps rows whose CustomerID is missing as a group of their own.
(
    df
    .groupby('CustomerID', dropna=False)
    .agg(agg_func_math)
    .round(2)
)

Unnamed: 0_level_0,UnitPrice,UnitPrice,UnitPrice,UnitPrice,UnitPrice,UnitPrice,UnitPrice,UnitPrice
Unnamed: 0_level_1,count,sum,mean,median,min,max,std,var
CustomerID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
12346.00,2,2.08,1.04,1.04,1.04,1.04,0.00,0.00
12347.00,182,481.21,2.64,2.02,0.25,12.75,2.26,5.09
12348.00,31,178.71,5.76,0.55,0.29,40.00,13.40,179.57
12349.00,73,605.10,8.29,2.55,0.42,300.00,35.03,1226.96
12350.00,17,65.30,3.84,1.65,0.85,40.00,9.33,87.14
...,...,...,...,...,...,...,...,...
18281.00,7,39.36,5.62,1.65,0.42,16.95,7.78,60.51
18282.00,13,62.68,4.82,2.95,0.29,12.75,4.86,23.65
18283.00,756,1220.93,1.61,1.65,0.29,15.95,1.10,1.20
18287.00,70,104.55,1.49,1.25,0.29,8.50,1.23,1.51
