## import

In [2]:
import pandas as pd
import gc

## Загружаем данные

In [3]:
# Load the fines dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv("fines.csv")
# Preview the first rows to check which columns were parsed.
df.head()

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.0,Ford,Focus,1989
1,E432XX77RUS,1,6500.0,Toyota,Camry,1995
2,7184TT36RUS,1,2100.0,Ford,Focus,1984
3,X582HE161RUS,2,2000.0,Ford,Focus,2015
4,92918M178RUS,1,5700.0,Ford,Focus,2014


## Расчет `Fines / Refund * Year` для каждой строки

### loop с iloc

In [4]:
%%timeit
def calc_iloc(df):
    """Return a list with ``Fines / Refund * Year`` for every row of ``df``.

    Rows are visited by integer position and each field is fetched through its
    own ``df.iloc[pos]`` lookup, which is the access pattern this benchmark
    is meant to measure.
    """
    values = []
    n_rows = len(df)
    for pos in range(n_rows):
        # Three independent positional lookups per row: Fines, Refund, then Year.
        ratio = df.iloc[pos]["Fines"] / df.iloc[pos]["Refund"]
        values.append(ratio * df.iloc[pos]["Year"])
    return values

# Store the per-row results as a new column. The cell runs under %%timeit, so the
# function definition and this assignment are both part of the measured code.
df["calc_iloc"] = calc_iloc(df)

134 ms ± 21.8 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### iterrows

In [5]:
%%timeit
def calc_iterrows(df):
    """Return a list with ``Fines / Refund * Year`` for every row of ``df``.

    Rows are produced by ``DataFrame.iterrows``; the index label that comes
    with each row is not needed and is ignored.
    """
    return [
        record["Fines"] / record["Refund"] * record["Year"]
        for _label, record in df.iterrows()
    ]

# Store the iterrows-based results as a new column; this assignment is part of
# the code timed by %%timeit.
df["calc_iterrows"] = calc_iterrows(df)

39.3 ms ± 1.48 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### apply + lambda

In [6]:
%%timeit
# Row-wise apply: axis=1 passes each row to the lambda as a Series, so the
# formula is still evaluated once per row in Python.
df["calc_apply"] = df.apply(lambda row: row["Fines"] / row["Refund"] * row["Year"], axis=1)

10.1 ms ± 2.68 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Series objects

In [7]:
%%timeit
# Vectorised version: the arithmetic is applied to whole columns (Series) at once
# instead of row by row.
df["calc_series"] = df["Fines"] / df["Refund"] * df["Year"]

204 µs ± 6.93 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


### Series objects + .values

In [8]:
%%timeit
# Same formula on the underlying NumPy arrays (.values), which bypasses the
# Series wrappers for the arithmetic itself.
df["calc_values"] = (df["Fines"].values / df["Refund"].values) * df["Year"].values

105 µs ± 17.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## Indexing до и после set_index

In [9]:
%%timeit
# Lookup through a boolean mask: every CarNumber value is compared with the
# target before the matching rows are selected.
df.loc[df["CarNumber"] == "O136HO197RUS"]

370 µs ± 9.46 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [10]:
# Promote CarNumber to the index so later lookups can use df.loc[label].
# The guard keeps the cell re-runnable: once CarNumber has become the index it is
# no longer a column, and calling set_index again would raise KeyError.
if "CarNumber" in df.columns:
    df = df.set_index("CarNumber")
df.head()

Unnamed: 0_level_0,Refund,Fines,Make,Model,Year,calc_iloc,calc_iterrows,calc_apply,calc_series,calc_values
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Y163O8161RUS,2,3200.0,Ford,Focus,1989,3182400.0,3182400.0,3182400.0,3182400.0,3182400.0
E432XX77RUS,1,6500.0,Toyota,Camry,1995,12967500.0,12967500.0,12967500.0,12967500.0,12967500.0
7184TT36RUS,1,2100.0,Ford,Focus,1984,4166400.0,4166400.0,4166400.0,4166400.0,4166400.0
X582HE161RUS,2,2000.0,Ford,Focus,2015,2015000.0,2015000.0,2015000.0,2015000.0,2015000.0
92918M178RUS,1,5700.0,Ford,Focus,2014,11479800.0,11479800.0,11479800.0,11479800.0,11479800.0


In [11]:
%%timeit
# Lookup by index label; this relies on CarNumber having been made the index
# in the previous cell.
df.loc["O136HO197RUS"]

72.2 µs ± 1.79 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## Downcasting

In [12]:
# Baseline memory report; memory_usage="deep" makes pandas inspect object columns
# so the reported size reflects the actual strings rather than an estimate.
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to TEST5
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Refund         930 non-null    int64  
 1   Fines          930 non-null    float64
 2   Make           930 non-null    object 
 3   Model          919 non-null    object 
 4   Year           930 non-null    int64  
 5   calc_iloc      930 non-null    float64
 6   calc_iterrows  930 non-null    float64
 7   calc_apply     930 non-null    float64
 8   calc_series    930 non-null    float64
 9   calc_values    930 non-null    float64
dtypes: float64(6), int64(2), object(2)
memory usage: 243.3 KB


In [13]:
# Work on a copy so the original frame keeps its dtypes for comparison.
optimized = df.copy()

# float64 -> float32: an unconditional cast applied to every float64 column at once.
float_cols = optimized.select_dtypes(include=["float64"]).columns
optimized = optimized.astype({name: "float32" for name in float_cols})

# int64 -> the smallest integer dtype that pd.to_numeric finds sufficient for
# the values of each column.
int_cols = optimized.select_dtypes(include=["int64"]).columns
for name in int_cols:
    optimized[name] = pd.to_numeric(optimized[name], downcast="integer")

optimized.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to TEST5
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Refund         930 non-null    int8   
 1   Fines          930 non-null    float32
 2   Make           930 non-null    object 
 3   Model          919 non-null    object 
 4   Year           930 non-null    int16  
 5   calc_iloc      930 non-null    float32
 6   calc_iterrows  930 non-null    float32
 7   calc_apply     930 non-null    float32
 8   calc_series    930 non-null    float32
 9   calc_values    930 non-null    float32
dtypes: float32(6), int16(1), int8(1), object(2)
memory usage: 209.7 KB


## Categories

In [14]:
# Convert every object-dtype column to the category dtype in a single astype
# call, then report the memory footprint again.
object_cols = optimized.select_dtypes(include=["object"]).columns
optimized = optimized.astype({name: "category" for name in object_cols})

optimized.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to TEST5
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   Refund         930 non-null    int8    
 1   Fines          930 non-null    float32 
 2   Make           930 non-null    category
 3   Model          919 non-null    category
 4   Year           930 non-null    int16   
 5   calc_iloc      930 non-null    float32 
 6   calc_iterrows  930 non-null    float32 
 7   calc_apply     930 non-null    float32 
 8   calc_series    930 non-null    float32 
 9   calc_values    930 non-null    float32 
dtypes: category(2), float32(6), int16(1), int8(1)
memory usage: 115.5 KB


## Memory cleaning

In [15]:
%reset_selective -f df
gc.collect()

0