In [76]:
import pandas as pd
import gc

In [77]:
df = pd.read_csv('../../datasets/fines.csv')

расчет fines/refund*year c помощью loop

In [78]:
def calculate_with_loop(df):
    results = []
    for i in range(0, len(df)):
        row = df.iloc[i]
        result = row['Fines'] / row['Refund'] * row['Year']
        results.append(result)
    return results

In [79]:
%%timeit
df['Calculated'] = calculate_with_loop(df)

43.4 ms ± 2.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


c помощью iterrows()

In [80]:
def calculate_with_iterrows(df):
    results = []
    for index, row in df.iterrows():
        result = row['Fines'] / row['Refund'] * row['Year']
        results.append(result)
    return results

In [81]:
%%timeit
df['Calculated'] = calculate_with_iterrows(df)

36.3 ms ± 823 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


с помощью apply()

In [82]:
def calculate_with_apply(df):
    return df.apply(lambda row: row['Fines'] / row['Refund'] * row['Year'], axis=1)

In [83]:
%%timeit
df['Calculated'] = calculate_with_apply(df)

8.19 ms ± 1.59 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


с помощью Series

In [84]:
def calculate_with_series(df):
    return df['Fines'] / df['Refund'] * df['Year']

In [85]:
%%timeit
df['Calculated'] = calculate_with_series(df)

203 μs ± 14.3 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


с помощью .values

In [86]:
def calculate_with_values(df):
    return df['Fines'].values / df['Refund'].values * df['Year'].values

In [87]:
%%timeit
df['Calculated'] = calculate_with_values(df)

111 μs ± 15.1 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [88]:
df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,Calculated
0,Y163O8161RUS,2.0,3200.0,Ford,Focus,1989,3182400.0
1,E432XX77RUS,1.0,6500.0,Toyota,Camry,1995,12967500.0
2,7184TT36RUS,1.0,2100.0,Ford,Focus,1984,4166400.0
3,X582HE161RUS,2.0,2000.0,Ford,Focus,2015,2015000.0
4,92918M178RUS,1.0,5700.0,Ford,Focus,2014,11479800.0
...,...,...,...,...,...,...,...
925,XYZ123RUS,2.0,0.0,Tesla,Model 3,2002,0.0
926,ABC456RUS,1.0,7500.0,BMW,X5,2009,15067500.0
927,DEF789RUS,1.0,3200.0,Audi,A4,2002,6406400.0
928,GHI012RUS,2.0,1500.0,Mercedes,C-Class,2007,1505250.0


измеряем время с помощью волшебной команды %%timeitв ячейке;получаем строку для определенного CarNumber, например,’O136HO197RUS’; устанавливаем индекс в вашем фрейме данных с помощью CarNumber;снова, получаем ряд за то же самое CarNumber

In [89]:
example_car = 'O136HO197RUS'

In [90]:
def get_row_without_index(df, car_number):
    return df[df['CarNumber'] == car_number]

In [91]:
%%timeit
get_row_without_index(df, example_car)

261 μs ± 8.65 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [92]:
def get_row_with_index(df, car_number):
    df_indexed = df.set_index('CarNumber')
    return df_indexed.loc[car_number]

In [93]:
%%timeit
get_row_with_index(df, example_car)

509 μs ± 20.4 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


запускаем df.info(memory_usage=’deep’), смотрим на Dtype и использование памяти;
преобразовываем copy() наш исходный фрейм данных в другой фрейм данных - optimized
преобразуем float64 в float32 для всех столбцов
понижаем от int64 до наименьшего возможного числового типа
запускаем info(memory_usage='deep') для нашего нового фрейма данных,смотрим на Dtype и использование памяти

In [94]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   CarNumber   930 non-null    object 
 1   Refund      930 non-null    float64
 2   Fines       930 non-null    float64
 3   Make        930 non-null    object 
 4   Model       918 non-null    object 
 5   Year        930 non-null    int64  
 6   Calculated  930 non-null    float64
dtypes: float64(3), int64(1), object(3)
memory usage: 203.7 KB


In [95]:
optimized = df.copy()

In [96]:
optimized['Calculated'] = optimized['Calculated'].astype('float32')
float_cols = optimized.select_dtypes(include=['float64']).columns
for col in float_cols:
    optimized[col] = pd.to_numeric(optimized[col], downcast='float')

In [97]:
int_cols = optimized.select_dtypes(include=['int64']).columns
for col in int_cols:
    optimized[col] = pd.to_numeric(optimized[col], downcast='integer')

In [98]:
optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   CarNumber   930 non-null    object 
 1   Refund      930 non-null    float32
 2   Fines       930 non-null    float32
 3   Make        930 non-null    object 
 4   Model       918 non-null    object 
 5   Year        930 non-null    int16  
 6   Calculated  930 non-null    float32
dtypes: float32(3), int16(1), object(3)
memory usage: 187.4 KB


меняем object тип столбцов на тип category и проверяем использование памяти

In [99]:
object_cols = optimized.select_dtypes(include=['object']).columns
for col in object_cols:
    optimized[col] = optimized[col].astype('category')

In [100]:
optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 930 entries, 0 to 929
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   CarNumber   930 non-null    category
 1   Refund      930 non-null    float32 
 2   Fines       930 non-null    float32 
 3   Make        930 non-null    category
 4   Model       918 non-null    category
 5   Year        930 non-null    int16   
 6   Calculated  930 non-null    float32 
dtypes: category(3), float32(3), int16(1)
memory usage: 70.9 KB


используем %reset_selective и библиотеку gc очищаем память

In [101]:
%reset_selective -f df

In [102]:
del df
gc.collect()

NameError: name 'df' is not defined

In [None]:
df