# Pandas

Joining DataFrames


In [1]:
import pandas as pd

In [2]:
df1 = pd.DataFrame(
    {
        'id': [1, 2, 3, 4],
        'name': ['Alice', 'Allakim', 'Kimbu', 'yangisi']
    }
)
df2 = pd.DataFrame(
    {
        'id': [6, 7, 8],
        'name': ['Bobir', 'Alkimyogar', 'Sardor']
    }
)

In [3]:
df1

Unnamed: 0,id,name
0,1,Alice
1,2,Allakim
2,3,Kimbu
3,4,yangisi


In [4]:
df2 + df1

Unnamed: 0,id,name
0,7.0,BobirAlice
1,9.0,AlkimyogarAllakim
2,11.0,SardorKimbu
3,,


In [5]:
# pd.concat([df1, df2])
pd.concat([df1, df2], axis=1)

Unnamed: 0,id,name,id.1,name.1
0,1,Alice,6.0,Bobir
1,2,Allakim,7.0,Alkimyogar
2,3,Kimbu,8.0,Sardor
3,4,yangisi,,


In [6]:
df1 = pd.DataFrame(
    {
        'id': [1, 2, 3],
        'name': ['Alice', 'Allakim', 'Kimbu']
    }
)
df2 = pd.DataFrame(
    {
        'id': [2, 3, 6, 7],
        # 'name': ['Allakim', 'Kimbu', 'Unknown', 'someone'],
        'score': [50, 60, 70, 90]
    }
)

In [7]:
df1

Unnamed: 0,id,name
0,1,Alice
1,2,Allakim
2,3,Kimbu


In [8]:
df2

Unnamed: 0,id,score
0,2,50
1,3,60
2,6,70
3,7,90


In [9]:
result = pd.merge(df1, df2, suffixes=['_birinchi', '_ikkinchi'], on=('id', 'name'))
result.columns = ['Shaxsiy raqami', 'Ismi', 'Bali']
result

KeyError: 'name'

In [None]:
result = pd.merge(df1, df2, on='id', how='left')
result #           ^    ^                   ^

Unnamed: 0,id,name_x,name_y,score
0,1,Alice,,
1,2,Allakim,Allakim,50.0
2,3,Kimbu,Kimbu,60.0


In [18]:
result = pd.merge(df1, df2, on='id', how='right')
result

Unnamed: 0,id,name_x,name_y,score
0,2,Allakim,Allakim,50
1,3,Kimbu,Kimbu,60
2,6,,Unknown,70
3,7,,someone,90


In [27]:
result = pd.merge(df1, df2, on='id', how='outer')
result

Unnamed: 0,id,name,score
0,1,Alice,
1,2,Allakim,50.0
2,3,Kimbu,60.0
3,6,,70.0
4,7,,90.0


In [25]:
result.sort_values(by=['score'])

Unnamed: 0,id,name,score
1,2,Allakim,50.0
2,3,Kimbu,60.0
3,6,Unknown,70.0
4,7,someone,90.0
0,1,Alice,


In [29]:
res_temp = pd.merge(df1, df2, on='id', how='left')
res_temp

Unnamed: 0,id,name,score
0,1,Alice,
1,2,Allakim,50.0
2,3,Kimbu,60.0


In [30]:
res_temp[res_temp['score'].isna()].drop(columns=['score'])

Unnamed: 0,id,name
0,1,Alice


### Grouping

In [31]:
data = {
    'Department': ['HR', 'HR', 'IT', 'IT', 'Finance', 'Finance', 'Finance'],
    'Employee': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'John'],
    'Salary': [50000, 60000, 70000, 80000, 90000, 100000, 110000]
}
df = pd.DataFrame(data)
df

Unnamed: 0,Department,Employee,Salary
0,HR,Alice,50000
1,HR,Bob,60000
2,IT,Charlie,70000
3,IT,David,80000
4,Finance,Eva,90000
5,Finance,Frank,100000
6,Finance,John,110000


In [33]:
df['Salary'].max()

np.int64(110000)

In [35]:
unique_department = df['Department'].drop_duplicates().tolist()
unique_department

['HR', 'IT', 'Finance']

In [44]:
for departments in unique_department:
    filtered = df[df['Department'] == departments]
    # display(filtered)
    # print('-'*10)
    max_sal = filtered['Salary'].max()
    print(f'Max salary for the {departments}: {max_sal}')

Max salary for the HR: 60000
Max salary for the IT: 80000
Max salary for the Finance: 110000


In [59]:
result = df.groupby('Department')['Salary']
type(result)

pandas.core.groupby.generic.SeriesGroupBy

In [60]:
result.max()

Department
Finance    110000
HR          60000
IT          80000
Name: Salary, dtype: int64

In [66]:
max_df = df.groupby('Department')['Salary'].max()
min_df = df.groupby('Department')['Salary'].min()
pd.merge(max_df, min_df, left_index=True, right_index=True, suffixes=('-Maximum', '-Minimum'))

Unnamed: 0_level_0,Salary-Maximum,Salary-Minimum
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Finance,110000,90000
HR,60000,50000
IT,80000,70000


In [70]:
max_df.reset_index() # drop=True

Unnamed: 0,Department,Salary
0,Finance,110000
1,HR,60000
2,IT,80000


In [73]:
df.groupby('Department')['Salary'].agg(['min', 'max', 'mean', 'median']).reset_index()

Unnamed: 0,Department,min,max,mean,median
0,Finance,90000,110000,100000.0,100000.0
1,HR,50000,60000,55000.0,55000.0
2,IT,70000,80000,75000.0,75000.0


In [75]:
def salary_range(x):
    return x.max() - x.min()

df.groupby('Department')['Salary'].agg(['min', 'max', salary_range])

Unnamed: 0_level_0,min,max,salary_range
Department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Finance,90000,110000,20000
HR,50000,60000,10000
IT,70000,80000,10000


In [26]:
data = {
    'Department': ['HR', 'HR', 'IT', 'IT', 'Finance', 'Finance'],
    'Employee': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank'],
    'Salary': [50000, 60000, 70000, 80000, 90000, 100000],
    'Bonus': [5000, 6000, 7000, 8000, 9000, 10000]
}
df = pd.DataFrame(data)
df

Unnamed: 0,Department,Employee,Salary,Bonus
0,HR,Alice,50000,5000
1,HR,Bob,60000,6000
2,IT,Charlie,70000,7000
3,IT,David,80000,8000
4,Finance,Eva,90000,9000
5,Finance,Frank,100000,10000


In [27]:
df.groupby('Department')[['Salary', 'Bonus']].agg(['min', 'count'])
# ['Bonus']            ^^^^^^^^^^^^^^^^^^^^^^                 

Unnamed: 0_level_0,Salary,Salary,Bonus,Bonus
Unnamed: 0_level_1,min,count,min,count
Department,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Finance,90000,2,9000,2
HR,50000,2,5000,2
IT,70000,2,7000,2


In [28]:
result = df.groupby('Department').agg({
    'Salary': ['max', 'min', 'mean'],
    'Bonus': ['sum', 'mean'],
    'Employee': ['count']
})

In [29]:
result

Unnamed: 0_level_0,Salary,Salary,Salary,Bonus,Bonus,Employee
Unnamed: 0_level_1,max,min,mean,sum,mean,count
Department,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Finance,100000,90000,95000.0,19000,9500.0,2
HR,60000,50000,55000.0,11000,5500.0,2
IT,80000,70000,75000.0,15000,7500.0,2


In [30]:
result.columns.get_level_values(1)

Index(['max', 'min', 'mean', 'sum', 'mean', 'count'], dtype='object')

**Apply**

In [31]:
df['Salary'] * 2

0    100000
1    120000
2    140000
3    160000
4    180000
5    200000
Name: Salary, dtype: int64

In [32]:
df['Salary'].apply(lambda x: x * 2)

0    100000
1    120000
2    140000
3    160000
4    180000
5    200000
Name: Salary, dtype: int64

In [33]:
def demo(x):
    print(type(x))
    return x * 2

df['Salary'].apply(demo)

<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>


0    100000
1    120000
2    140000
3    160000
4    180000
5    200000
Name: Salary, dtype: int64

In [34]:
df

Unnamed: 0,Department,Employee,Salary,Bonus
0,HR,Alice,50000,5000
1,HR,Bob,60000,6000
2,IT,Charlie,70000,7000
3,IT,David,80000,8000
4,Finance,Eva,90000,9000
5,Finance,Frank,100000,10000


In [35]:
df.apply(demo, axis=1)

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


Unnamed: 0,Department,Employee,Salary,Bonus
0,HRHR,AliceAlice,100000,10000
1,HRHR,BobBob,120000,12000
2,ITIT,CharlieCharlie,140000,14000
3,ITIT,DavidDavid,160000,16000
4,FinanceFinance,EvaEva,180000,18000
5,FinanceFinance,FrankFrank,200000,20000


In [36]:
def debug(x):
    display(x)
    print('-'*50)
    return x

df.apply(debug, axis=1)

Department       HR
Employee      Alice
Salary        50000
Bonus          5000
Name: 0, dtype: object

--------------------------------------------------


Department       HR
Employee        Bob
Salary        60000
Bonus          6000
Name: 1, dtype: object

--------------------------------------------------


Department         IT
Employee      Charlie
Salary          70000
Bonus            7000
Name: 2, dtype: object

--------------------------------------------------


Department       IT
Employee      David
Salary        80000
Bonus          8000
Name: 3, dtype: object

--------------------------------------------------


Department    Finance
Employee          Eva
Salary          90000
Bonus            9000
Name: 4, dtype: object

--------------------------------------------------


Department    Finance
Employee        Frank
Salary         100000
Bonus           10000
Name: 5, dtype: object

--------------------------------------------------


Unnamed: 0,Department,Employee,Salary,Bonus
0,HR,Alice,50000,5000
1,HR,Bob,60000,6000
2,IT,Charlie,70000,7000
3,IT,David,80000,8000
4,Finance,Eva,90000,9000
5,Finance,Frank,100000,10000


In [37]:
df['NameWithSalary'] = df.apply(lambda x: x['Employee'] + '-' + str(x['Salary']), axis=1)

In [38]:
df

Unnamed: 0,Department,Employee,Salary,Bonus,NameWithSalary
0,HR,Alice,50000,5000,Alice-50000
1,HR,Bob,60000,6000,Bob-60000
2,IT,Charlie,70000,7000,Charlie-70000
3,IT,David,80000,8000,David-80000
4,Finance,Eva,90000,9000,Eva-90000
5,Finance,Frank,100000,10000,Frank-100000


In [39]:
df['Department'].iloc[0:2]

0    HR
1    HR
Name: Department, dtype: object

In [40]:
df.apply(lambda x: x.iloc[:2])

Unnamed: 0,Department,Employee,Salary,Bonus,NameWithSalary
0,HR,Alice,50000,5000,Alice-50000
1,HR,Bob,60000,6000,Bob-60000


In [42]:
df_new = df.set_index('Employee')
df_new

Unnamed: 0_level_0,Department,Salary,Bonus,NameWithSalary
Employee,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alice,HR,50000,5000,Alice-50000
Bob,HR,60000,6000,Bob-60000
Charlie,IT,70000,7000,Charlie-70000
David,IT,80000,8000,David-80000
Eva,Finance,90000,9000,Eva-90000
Frank,Finance,100000,10000,Frank-100000


In [48]:
df_new.loc['Alice']

Department                 HR
Salary                  50000
Bonus                    5000
NameWithSalary    Alice-50000
Name: Alice, dtype: object

In [50]:
df_new.iloc[0]

Department                 HR
Salary                  50000
Bonus                    5000
NameWithSalary    Alice-50000
Name: Alice, dtype: object

In [52]:
df.drop('NameWithSalary', axis=1)

Unnamed: 0,Department,Employee,Salary,Bonus
0,HR,Alice,50000,5000
1,HR,Bob,60000,6000
2,IT,Charlie,70000,7000
3,IT,David,80000,8000
4,Finance,Eva,90000,9000
5,Finance,Frank,100000,10000


In [51]:
df.drop(columns=['NameWithSalary'])

Unnamed: 0,Department,Employee,Salary,Bonus
0,HR,Alice,50000,5000
1,HR,Bob,60000,6000
2,IT,Charlie,70000,7000
3,IT,David,80000,8000
4,Finance,Eva,90000,9000
5,Finance,Frank,100000,10000


In [53]:
df

Unnamed: 0,Department,Employee,Salary,Bonus,NameWithSalary
0,HR,Alice,50000,5000,Alice-50000
1,HR,Bob,60000,6000,Bob-60000
2,IT,Charlie,70000,7000,Charlie-70000
3,IT,David,80000,8000,David-80000
4,Finance,Eva,90000,9000,Eva-90000
5,Finance,Frank,100000,10000,Frank-100000


In [54]:
df.map(lambda x: x*2)

Unnamed: 0,Department,Employee,Salary,Bonus,NameWithSalary
0,HRHR,AliceAlice,100000,10000,Alice-50000Alice-50000
1,HRHR,BobBob,120000,12000,Bob-60000Bob-60000
2,ITIT,CharlieCharlie,140000,14000,Charlie-70000Charlie-70000
3,ITIT,DavidDavid,160000,16000,David-80000David-80000
4,FinanceFinance,EvaEva,180000,18000,Eva-90000Eva-90000
5,FinanceFinance,FrankFrank,200000,20000,Frank-100000Frank-100000


In [55]:
df.apply(lambda x: x*2)

Unnamed: 0,Department,Employee,Salary,Bonus,NameWithSalary
0,HRHR,AliceAlice,100000,10000,Alice-50000Alice-50000
1,HRHR,BobBob,120000,12000,Bob-60000Bob-60000
2,ITIT,CharlieCharlie,140000,14000,Charlie-70000Charlie-70000
3,ITIT,DavidDavid,160000,16000,David-80000David-80000
4,FinanceFinance,EvaEva,180000,18000,Eva-90000Eva-90000
5,FinanceFinance,FrankFrank,200000,20000,Frank-100000Frank-100000


In [58]:
df

Unnamed: 0,Department,Employee,Salary,Bonus,NameWithSalary
0,HR,Alice,50000,5000,Alice-50000
1,HR,Bob,60000,6000,Bob-60000
2,IT,Charlie,70000,7000,Charlie-70000
3,IT,David,80000,8000,David-80000
4,Finance,Eva,90000,9000,Eva-90000
5,Finance,Frank,100000,10000,Frank-100000


In [56]:
def debug(x):
    print(type(x))
    print(x)
    print('-'*50)
    return x
df.map(debug)

<class 'str'>
HR
--------------------------------------------------
<class 'str'>
HR
--------------------------------------------------
<class 'str'>
IT
--------------------------------------------------
<class 'str'>
IT
--------------------------------------------------
<class 'str'>
Finance
--------------------------------------------------
<class 'str'>
Finance
--------------------------------------------------
<class 'str'>
Alice
--------------------------------------------------
<class 'str'>
Bob
--------------------------------------------------
<class 'str'>
Charlie
--------------------------------------------------
<class 'str'>
David
--------------------------------------------------
<class 'str'>
Eva
--------------------------------------------------
<class 'str'>
Frank
--------------------------------------------------
<class 'int'>
50000
--------------------------------------------------
<class 'int'>
60000
--------------------------------------------------
<class 'int'>
70

Unnamed: 0,Department,Employee,Salary,Bonus,NameWithSalary
0,HR,Alice,50000,5000,Alice-50000
1,HR,Bob,60000,6000,Bob-60000
2,IT,Charlie,70000,7000,Charlie-70000
3,IT,David,80000,8000,David-80000
4,Finance,Eva,90000,9000,Eva-90000
5,Finance,Frank,100000,10000,Frank-100000


In [60]:
df.apply(debug, axis=0)

<class 'pandas.core.series.Series'>
0         HR
1         HR
2         IT
3         IT
4    Finance
5    Finance
Name: Department, dtype: object
--------------------------------------------------
<class 'pandas.core.series.Series'>
0      Alice
1        Bob
2    Charlie
3      David
4        Eva
5      Frank
Name: Employee, dtype: object
--------------------------------------------------
<class 'pandas.core.series.Series'>
0     50000
1     60000
2     70000
3     80000
4     90000
5    100000
Name: Salary, dtype: int64
--------------------------------------------------
<class 'pandas.core.series.Series'>
0     5000
1     6000
2     7000
3     8000
4     9000
5    10000
Name: Bonus, dtype: int64
--------------------------------------------------
<class 'pandas.core.series.Series'>
0      Alice-50000
1        Bob-60000
2    Charlie-70000
3      David-80000
4        Eva-90000
5     Frank-100000
Name: NameWithSalary, dtype: object
--------------------------------------------------


Unnamed: 0,Department,Employee,Salary,Bonus,NameWithSalary
0,HR,Alice,50000,5000,Alice-50000
1,HR,Bob,60000,6000,Bob-60000
2,IT,Charlie,70000,7000,Charlie-70000
3,IT,David,80000,8000,David-80000
4,Finance,Eva,90000,9000,Eva-90000
5,Finance,Frank,100000,10000,Frank-100000


## $$pipe$$