In [1]:
import pandas as pd

## 1. Читаем файл fines.csv из предыдущего задания.

In [2]:
data = pd.read_csv('../ex04/fines.csv')
data

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.0,Ford,Focus,1989.0
1,E432XX77RUS,1,6500.0,Toyota,Camry,1995.0
2,7184TT36RUS,1,2100.0,Ford,Focus,1984.0
3,X582HE161RUS,2,2000.0,Ford,Focus,2015.0
4,92918M178RUS,1,5700.0,Ford,Focus,2014.0
...,...,...,...,...,...,...
925,A123BC77RUS,1,5000.0,Toyota,Camry,2002.0
926,B456ME99RUS,2,750.0,Ford,Focus,2005.0
927,C789TM55RUS,1,3200.0,BMW,X5,1999.0
928,P012HC33RUS,2,1500.0,Hyundai,Solaris,2021.0


## 2. Итерации.

In [3]:
def calculation(fines, refund, year):
    return (fines/refund)*year

In [4]:
%%timeit
def calculated_data_loop(data):
    result = []
    for i in range(len(data)):
        fines = data.iloc[i]['Fines']
        refund = data.iloc[i]['Refund']
        year = data.iloc[i]['Year']
        value = calculation(fines, refund, year)
        result.append(value)
    data['CalcLoop'] = result
df_copy = data.copy()
calculated_data_loop(df_copy)

112 ms ± 12.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
%%timeit
def calculated_data_iterrows(data):
    result = []
    for _, row in data.iterrows():
        fines = row['Fines']
        refund = row['Refund']
        year = row['Year']
        value = calculation(fines, refund, year)
        result.append(value)
    data['CalcIterrows'] = result
df_copy = data.copy()
calculated_data_iterrows(df_copy)    

40 ms ± 7.18 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [6]:
%%timeit
df_copy = data.copy()
data['CalcApply'] = df_copy.apply(lambda row: row['Fines'] / row['Refund'] * row['Year'], axis=1)

7.9 ms ± 2.93 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [7]:
%%timeit
data['CalcSeries'] = (data['Fines'] / data['Refund']) * data['Year']

197 μs ± 26.9 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [8]:
%%timeit
fines = data['Fines'].values
refund = data['Refund'].values
year = data['Year'].values
data['CalcValues'] = (fines / refund) * year

82.6 μs ± 15 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [9]:
data.head(6)

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,CalcApply,CalcSeries,CalcValues
0,Y163O8161RUS,2,3200.0,Ford,Focus,1989.0,3182400.0,3182400.0,3182400.0
1,E432XX77RUS,1,6500.0,Toyota,Camry,1995.0,12967500.0,12967500.0,12967500.0
2,7184TT36RUS,1,2100.0,Ford,Focus,1984.0,4166400.0,4166400.0,4166400.0
3,X582HE161RUS,2,2000.0,Ford,Focus,2015.0,2015000.0,2015000.0,2015000.0
4,92918M178RUS,1,5700.0,Ford,Focus,2014.0,11479800.0,11479800.0,11479800.0
5,H234YH197RUS,2,6000.0,Ford,Focus,1990.0,5970000.0,5970000.0,5970000.0


## 3. Индексация. Поиск по значению столбца и по индексу.

In [10]:
%%timeit
data[data['CarNumber'] == 'B456ME99RUS']

276 μs ± 32.7 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [11]:
%%timeit
data_indexed = data.set_index('CarNumber')
data_indexed.loc['B456ME99RUS']

530 μs ± 109 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


## 4. Понижение.

In [12]:
data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   CarNumber   930 non-null    object 
 1   Refund      930 non-null    int64  
 2   Fines       930 non-null    float64
 3   Make        930 non-null    object 
 4   Model       919 non-null    object 
 5   Year        930 non-null    float64
 6   CalcApply   930 non-null    float64
 7   CalcSeries  930 non-null    float64
 8   CalcValues  930 non-null    float64
dtypes: float64(5), int64(1), object(3)
memory usage: 196.7 KB


In [13]:
optimized = data.copy()

In [14]:
float_cols = optimized.select_dtypes(include=['float64']).columns
optimized[float_cols] = optimized[float_cols].astype('float32')

In [15]:
int_cols = optimized.select_dtypes(include=['int64']).columns
optimized[int_cols] = optimized[int_cols].apply(pd.to_numeric, downcast='integer')

In [16]:
optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   CarNumber   930 non-null    object 
 1   Refund      930 non-null    int8   
 2   Fines       930 non-null    float32
 3   Make        930 non-null    object 
 4   Model       919 non-null    object 
 5   Year        930 non-null    float32
 6   CalcApply   930 non-null    float32
 7   CalcSeries  930 non-null    float32
 8   CalcValues  930 non-null    float32
dtypes: float32(5), int8(1), object(3)
memory usage: 172.1 KB


## 5. Категории.

In [17]:
data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   CarNumber   930 non-null    object 
 1   Refund      930 non-null    int64  
 2   Fines       930 non-null    float64
 3   Make        930 non-null    object 
 4   Model       919 non-null    object 
 5   Year        930 non-null    float64
 6   CalcApply   930 non-null    float64
 7   CalcSeries  930 non-null    float64
 8   CalcValues  930 non-null    float64
dtypes: float64(5), int64(1), object(3)
memory usage: 196.7 KB


In [18]:
object_cols = data.select_dtypes(include='object').columns
data[object_cols] = data[object_cols].astype('category')

In [19]:
data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   CarNumber   930 non-null    category
 1   Refund      930 non-null    int64   
 2   Fines       930 non-null    float64 
 3   Make        930 non-null    category
 4   Model       919 non-null    category
 5   Year        930 non-null    float64 
 6   CalcApply   930 non-null    float64 
 7   CalcSeries  930 non-null    float64 
 8   CalcValues  930 non-null    float64 
dtypes: category(3), float64(5), int64(1)
memory usage: 97.1 KB


## 6. Очистка памяти.

In [20]:
%reset_selective -f data

In [21]:
import gc
gc.collect()

7