In [1]:
import pandas as pd

In [2]:
# (name, salary) records for the four example people.
salaries = [
    ("Mark", 1000),
    ("John", 1500),
    ("Daniel", 2300),
    ("Greg", 5000),
]

In [3]:
# Names as a Series; no index is passed, so pandas labels the rows 0..3.
names_series = pd.Series(
    data=["Mark", "John", "Daniel", "Greg"],
)
names_series

0      Mark
1      John
2    Daniel
3      Greg
dtype: object

In [4]:
# Salaries as an integer Series, in the same order as the names.
salary_series = pd.Series(
    data=[1000, 1500, 2300, 5000],
)
salary_series

0    1000
1    1500
2    2300
3    5000
dtype: int64

In [5]:
# Summary of the text Series: for this non-numeric data describe() reports
# count, unique, top and freq rather than mean/std/quartiles.
names_series.describe()

count        4
unique       4
top       Mark
freq         1
dtype: object

In [6]:
# Numeric summary of the salaries: count, mean, std, min, quartiles and max.
salary_series.describe()

count       4.000000
mean     2450.000000
std      1782.320585
min      1000.000000
25%      1375.000000
50%      1900.000000
75%      2975.000000
max      5000.000000
dtype: float64

Była podwyżka: każdą pensję zwiększamy o 100.

In [9]:
# Flat raise of 100 for everyone. The addition is applied element-wise and
# produces a new Series, so salary_series keeps the old values.
new_salaries = salary_series.add(100)
new_salaries.describe()

count       4.000000
mean     2550.000000
std      1782.320585
min      1100.000000
25%      1475.000000
50%      2000.000000
75%      3075.000000
max      5100.000000
dtype: float64

In [11]:
# Replace the positional labels 0..3 with the names.
# NOTE: this assigns the index in place, so salary_series itself is modified
# from this cell onwards.
salary_series.index = names_series
salary_series

Mark      1000
John      1500
Daniel    2300
Greg      5000
dtype: int64

In [13]:
# Build the name-indexed Series in one step by handing the labels to the
# constructor instead of assigning .index afterwards.
salaries = [1000, 1500, 2300, 5000]
names = ["Mark", "John", "Daniel", "Greg"]
salary_series_improved = pd.Series(data=salaries, index=names)
salary_series_improved

Mark      1000
John      1500
Daniel    2300
Greg      5000
dtype: int64

DataFrame-y

In [14]:
# Each record is (name, salary, age).
salaries = [
    ("Mark", 1000, 23),
    ("John", 1500, 25),
    ("Daniel", 2300, 38),
    ("Greg", 5000, 42),
]

# No column names are given here, so pandas labels the columns 0, 1, 2.
df = pd.DataFrame(data=salaries)
df

Unnamed: 0,0,1,2
0,Mark,1000,23
1,John,1500,25
2,Daniel,2300,38
3,Greg,5000,42


In [15]:
# Name the columns and promote "name" to the row index in a single chain.
df = (
    pd.DataFrame(salaries, columns=["name", "salary", "age"])
    .set_index("name")
)
df

Unnamed: 0_level_0,salary,age
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Mark,1000,23
John,1500,25
Daniel,2300,38
Greg,5000,42


In [16]:
# Per-column numeric summary (count, mean, std, min, quartiles, max) of the
# salary and age columns.
df.describe()


Unnamed: 0,salary,age
count,4.0,4.0
mean,2450.0,32.0
std,1782.320585,9.416298
min,1000.0,23.0
25%,1375.0,24.5
50%,1900.0,31.5
75%,2975.0,39.0
max,5000.0,42.0


# Która metoda jest szybsza: operacja wektorowa `pd.Series + x` czy `apply()`?

In [17]:
import pandas as pd
import numpy as np
import time

# Series lengths to benchmark, from ten thousand to five million elements.
sizes = [
    10_000,
    100_000,
    1_000_000,
    5_000_000,
]

def benchmark(size: int, repeats: int = 1) -> tuple:
    """Time ``Series.apply(lambda ...)`` against vectorised addition.

    A Series of ``size`` random integers in [3000, 8000) is created, then
    2000 is added to every element in two ways: element by element through
    ``apply`` and with a single vectorised ``+``. The timings are printed and
    also returned so they can be collected or plotted.

    Args:
        size: Number of elements in the test Series.
        repeats: How many times each variant is timed; the fastest run is
            reported. The default of 1 keeps the original single-shot
            behaviour; higher values reduce measurement noise.

    Returns:
        ``(t_apply, t_vec)``: best wall-clock time in seconds of the
        ``apply`` variant and of the vectorised variant.

    Raises:
        ValueError: If ``repeats`` is smaller than 1.
    """
    if repeats < 1:
        raise ValueError("repeats must be at least 1")

    print(f"\n=== TEST dla {size:,} elementów ===")

    s = pd.Series(np.random.randint(3000, 8000, size=size))

    def best_time(operation):
        # The minimum over several runs is the measurement least disturbed by
        # other activity on the machine.
        timings = []
        for _ in range(repeats):
            start = time.perf_counter()
            operation()
            timings.append(time.perf_counter() - start)
        return min(timings)

    # Element-wise Python callback versus one vectorised operation.
    t_apply = best_time(lambda: s.apply(lambda x: x + 2000))
    t_vec = best_time(lambda: s + 2000)

    print(f"apply(lambda): {t_apply:.6f} s")
    print(f"wektorowo:     {t_vec:.6f} s")
    if t_vec > 0:
        print(f"Różnica:       {t_apply / t_vec:.1f}× wolniej (apply)")
    else:
        # The vectorised run finished below the clock resolution, so the
        # ratio is undefined; report that instead of dividing by zero.
        print("Różnica:       n/d (czas wektorowy poniżej rozdzielczości zegara)")

    return t_apply, t_vec

# Run the benchmark once for every configured size, smallest first.
for size in sizes:
    benchmark(size)



=== TEST dla 10,000 elementów ===
apply(lambda): 0.041618 s
wektorowo:     0.000448 s
Różnica:       92.9× wolniej (apply)

=== TEST dla 100,000 elementów ===
apply(lambda): 0.321929 s
wektorowo:     0.002103 s
Różnica:       153.1× wolniej (apply)

=== TEST dla 1,000,000 elementów ===
apply(lambda): 2.488269 s
wektorowo:     0.005373 s
Różnica:       463.1× wolniej (apply)

=== TEST dla 5,000,000 elementów ===
apply(lambda): 5.530371 s
wektorowo:     0.018241 s
Różnica:       303.2× wolniej (apply)
