# Pandas Tutorial

Pandas-ը պիտոնի գրադարան է, որը թույլ է տալիս հեշտությամբ աշխատել մեծ տվյալների հետ և դրանք անալիզ անել։

Պանդան ունի երկու տվյալային կառուցվածք:

    Series
    DataFrame

### Importing the modules

In [None]:
#!pip install pandas

In [1]:
import pandas as pd
import numpy as np 

## Pandas.series
Series-ը միաչափ զանգված է։ Դրա տարրերը պիտակավորված են և նույն տիպի են։


### Create empty series

In [4]:
s = pd.Series(dtype=np.int32)

In [5]:
s

Series([], dtype: int32)

### Create series from ndarray

In [6]:
data = np.array(['a', 'b', 'c', 'd'])

s = pd.Series(data)

In [7]:
s

0    a
1    b
2    c
3    d
dtype: object

In [8]:
s = pd.Series(data, index=[2, 4, 6, 8])

In [9]:
s

2    a
4    b
6    c
8    d
dtype: object

In [12]:
s = pd.Series(data, index=['2aa', '4', '6', '8'])

In [13]:
s

2aa    a
4      b
6      c
8      d
dtype: object

### Create series from scalar

In [14]:
scalar_s = pd.Series(4, index=[1, 2, 3, 4, 5, 6, 7])

In [15]:
scalar_s

1    4
2    4
3    4
4    4
5    4
6    4
7    4
dtype: int64

### Create series from dictionary

In [16]:
data = {'a': 1, 'b': 32, 'c': 42}

s = pd.Series(data)

In [17]:
s

a     1
b    32
c    42
dtype: int64

In [18]:
print(s)

a     1
b    32
c    42
dtype: int64


### Accessing elements with index

In [19]:
s[0:2]

a     1
b    32
dtype: int64

In [21]:
s['a':'b']

a     1
b    32
dtype: int64

## Pandas.DataFrame
DataFrame-ը երկչափանի տվյալների կառուցվածք է։ Ամեն սյուն մի տիպի փոփոխական է պահում, սակայն տարբեր սյուներ կարող են պահել տարբեր տիպեր։

pandas.DataFrame(data, index, columns, dtype)

### Create an empty dataframe

In [22]:
df = pd.DataFrame()

print(df)

Empty DataFrame
Columns: []
Index: []


### Create dataframe from list

In [34]:
data = np.random.randint(0, 20, 20)
data

array([16, 17, 17, 18,  5, 17, 16,  1, 18,  3, 14,  9,  2,  2,  8, 19,  8,
       12,  3, 15])

In [None]:
df = pd.DataFrame(data, columns=['Column 1'], index=range(1, 21))
df

In [33]:
data = [['Aram', 24, 170], ['Anna', 25, 170], ['Monty', 26, 171]]

df = pd.DataFrame(data, columns=['Name', 'Age', 'Height'], index=[1, 2, 3])
df

Unnamed: 0,Name,Age,Height
1,Aram,24,170
2,Anna,25,170
3,Monty,26,171


### Create dataframe from a dictionary

In [37]:
data = {'Name': ['Aram', 'Anna', 'Monty'], 'Age': [24, 25, 26], 'Height': [176, 176, 189]}

df = pd.DataFrame(data, index=[1, 2, 3])
df

Unnamed: 0,Name,Age,Height
1,Aram,24,176
2,Anna,25,176
3,Monty,26,189


In [None]:
df1.dtypes

### Give column names

In [39]:
df.columns = ['First Name', 'ID', 'Height']

Unnamed: 0,First Name,ID,Height
1,Aram,24,176
2,Anna,25,176
3,Monty,26,189


### Create dataframe from a list of dictionaries

In [43]:
data = [{'First Name': 'Aram', 'Age': 24, 'Height': 178},
        {'Name': 'Anna', 'Age': 54, 'Height': 199},
        {'Name': 'Monty', 'Age': 231, 'Height': 200, 'E-mail': 'monty.python@gmail.com'}]

df = pd.DataFrame(data)
df

Unnamed: 0,First Name,Age,Height,Name,E-mail
0,Aram,24,178,,
1,,54,199,Anna,
2,,231,200,Monty,monty.python@gmail.com


### Selecting columns

In [49]:
df_1 = df[['Height', 'Age', 'E-mail']]

In [50]:
df_1

Unnamed: 0,Height,Age,E-mail
0,178,24,
1,199,54,
2,200,231,monty.python@gmail.com


### .loc, .iloc(use integer indexing)

In [51]:
df

Unnamed: 0,First Name,Age,Height,Name,E-mail
0,Aram,24,178,,
1,,54,199,Anna,
2,,231,200,Monty,monty.python@gmail.com


In [55]:
df.loc[1:2, 'Height':'E-mail']

Unnamed: 0,Height,Name,E-mail
1,199,Anna,
2,200,Monty,monty.python@gmail.com


In [58]:
df.iloc[1:3, 1:-1]

Unnamed: 0,Age,Height,Name
1,54,199,Anna
2,231,200,Monty


In [59]:
df[df['Name'] == 'Anna']

Unnamed: 0,First Name,Age,Height,Name,E-mail
1,,54,199,Anna,


In [60]:
df['Name'] == 'Anna'

0    False
1     True
2    False
Name: Name, dtype: bool

In [61]:
df[df['Age'] > 30]

Unnamed: 0,First Name,Age,Height,Name,E-mail
1,,54,199,Anna,
2,,231,200,Monty,monty.python@gmail.com


In [65]:
df[(df['Age'] > 50) | (df['Name'] == 'Anna')]

Unnamed: 0,First Name,Age,Height,Name,E-mail
1,,54,199,Anna,
2,,231,200,Monty,monty.python@gmail.com


In [66]:
df[(df['Age'] < 50) & (df['Name'] == 'Anna')]

Unnamed: 0,First Name,Age,Height,Name,E-mail


### Filtering by a column condition

In [None]:
df[(df['Age'] < 50) & (df['Name'] == 'Anna')]

### Adding a column

In [68]:
df['Gender'] = ['m', 'f', 'm']

In [69]:
df

Unnamed: 0,First Name,Age,Height,Name,E-mail,Gender
0,Aram,24,178,,,m
1,,54,199,Anna,,f
2,,231,200,Monty,monty.python@gmail.com,m


In [74]:
df['Employed'] = True

In [71]:
df

Unnamed: 0,First Name,Age,Height,Name,E-mail,Gender,Employed
0,Aram,24,178,,,m,True
1,,54,199,Anna,,f,True
2,,231,200,Monty,monty.python@gmail.com,m,True


Inserting column to a location: 
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.insert.html

### Column deletion

In [72]:
del df['Employed']

In [76]:
df

Unnamed: 0,First Name,Age,Height,Name,E-mail,Gender,Employed
0,Aram,24,178,,,m,True
1,,54,199,Anna,,f,True
2,,231,200,Monty,monty.python@gmail.com,m,True


In [79]:
df_1 = df.drop('Employed', axis=1)

Unnamed: 0,First Name,Age,Height,Name,E-mail,Gender
0,Aram,24,178,,,m
1,,54,199,Anna,,f
2,,231,200,Monty,monty.python@gmail.com,m


In [81]:
df.drop('Employed', axis=1, inplace=True)

In [89]:
df_2 = df.drop(1, axis=0)

In [90]:
df_2

Unnamed: 0,First Name,Age,Height,Name,E-mail,Gender
2,,231,200,Monty,monty.python@gmail.com,m


In [91]:
df.drop(1, axis=0)

Unnamed: 0,First Name,Age,Height,Name,E-mail,Gender
2,,231,200,Monty,monty.python@gmail.com,m


### Slice rows

In [94]:
data = [{'Name': 'Aram', 'Age': 24, 'Height': 178},
        {'Name': 'Anna', 'Age': 54, 'Height': 199},
        {'Name': 'Monty', 'Age': 231, 'Height': 200, 'E-mail': 'monty.python@gmail.com'},
        {'Name': 'Sarah', 'Age': 21, 'Height': 170, 'E-mail': 'sarah@gmail.com'}]

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Height,E-mail
0,Aram,24,178,
1,Anna,54,199,
2,Monty,231,200,monty.python@gmail.com
3,Sarah,21,170,sarah@gmail.com


In [96]:
df[0:3:2]

Unnamed: 0,Name,Age,Height,E-mail
0,Aram,24,178,
2,Monty,231,200,monty.python@gmail.com


## Some series and dataframe functions

In [97]:
s = pd.Series(np.random.random(10) * 6)

In [99]:
s.axes

[RangeIndex(start=0, stop=10, step=1)]

In [100]:
s.axes[0]

RangeIndex(start=0, stop=10, step=1)

In [101]:
s.index

RangeIndex(start=0, stop=10, step=1)

In [102]:
s

0    0.830660
1    5.688606
2    5.559525
3    0.406753
4    5.868887
5    3.348626
6    2.509786
7    5.609384
8    5.139526
9    3.446673
dtype: float64

In [103]:
s.dtype

dtype('float64')

In [104]:
df.dtypes

Name      object
Age        int64
Height     int64
E-mail    object
dtype: object

In [105]:
s.empty

False

In [106]:
s.ndim

1

In [107]:
df.ndim

2

In [109]:
print(df.shape)
df


(4, 4)


Unnamed: 0,Name,Age,Height,E-mail
0,Aram,24,178,
1,Anna,54,199,
2,Monty,231,200,monty.python@gmail.com
3,Sarah,21,170,sarah@gmail.com


In [113]:
type(df.head(2))

pandas.core.frame.DataFrame

In [112]:
df.tail(2)

Unnamed: 0,Name,Age,Height,E-mail
2,Monty,231,200,monty.python@gmail.com
3,Sarah,21,170,sarah@gmail.com


### Count the number of values in a column

In [119]:
df = df.append({'Name': 'Anna', 'Age': 42, 'Height': 178}, ignore_index=True)

In [120]:
df

Unnamed: 0,Name,Age,Height,E-mail
0,Aram,24,178,
1,Anna,54,199,
2,Monty,231,200,monty.python@gmail.com
3,Sarah,21,170,sarah@gmail.com
4,Anna,42,178,


In [121]:
df['Name'].value_counts()

Anna     2
Sarah    1
Monty    1
Aram     1
Name: Name, dtype: int64

# Missing values

In [126]:
data = np.random.random(25).reshape(5, 5)

In [134]:
df_3 = pd.DataFrame(data, index=range(0, 10, 2), columns=['one', 'two', 'three', 'four', 'five'])

In [141]:
df_3

Unnamed: 0,one,two,three,four,five
0,0.730897,0.933293,0.439829,0.669741,0.595757
1,,,,,
2,0.557611,0.188567,0.596341,0.889225,0.416072
3,,,,,
4,0.504954,0.463887,0.336316,0.771672,0.038554
5,,,,,
6,0.150224,0.97494,0.576741,0.132581,0.552165
7,,,,,
8,0.35616,0.253132,0.389951,0.357658,0.275658
9,,,,,


In [139]:
df_3 = df_3.reindex(range(0, 10))

In [140]:
df_3['one'].isnull()

0    False
1     True
2    False
3     True
4    False
5     True
6    False
7     True
8    False
9     True
Name: one, dtype: bool

In [142]:
df_3['one'].isna()

0    False
1     True
2    False
3     True
4    False
5     True
6    False
7     True
8    False
9     True
Name: one, dtype: bool

In [144]:
df_3['one'].isnull().sum()

5

In [150]:
df_3['two'].notnull()

0     True
1    False
2     True
3    False
4     True
5    False
6     True
7    False
8     True
9    False
Name: two, dtype: bool

In [146]:
df_3['one'].notnull().sum()

5

In [149]:
df_3.notnull().sum().sum()

25

In [151]:
df_3.notna()

Unnamed: 0,one,two,three,four,five
0,True,True,True,True,True
1,False,False,False,False,False
2,True,True,True,True,True
3,False,False,False,False,False
4,True,True,True,True,True
5,False,False,False,False,False
6,True,True,True,True,True
7,False,False,False,False,False
8,True,True,True,True,True
9,False,False,False,False,False


# Replacing the missing data

In [152]:
df_3.fillna(method='bfill')

Unnamed: 0,one,two,three,four,five
0,0.730897,0.933293,0.439829,0.669741,0.595757
1,0.557611,0.188567,0.596341,0.889225,0.416072
2,0.557611,0.188567,0.596341,0.889225,0.416072
3,0.504954,0.463887,0.336316,0.771672,0.038554
4,0.504954,0.463887,0.336316,0.771672,0.038554
5,0.150224,0.97494,0.576741,0.132581,0.552165
6,0.150224,0.97494,0.576741,0.132581,0.552165
7,0.35616,0.253132,0.389951,0.357658,0.275658
8,0.35616,0.253132,0.389951,0.357658,0.275658
9,,,,,


In [157]:
df_3.fillna(np.mean(df_3))

Unnamed: 0,one,two,three,four,five
0,0.730897,0.933293,0.439829,0.669741,0.595757
1,0.459969,0.562764,0.467836,0.564175,0.375641
2,0.557611,0.188567,0.596341,0.889225,0.416072
3,0.459969,0.562764,0.467836,0.564175,0.375641
4,0.504954,0.463887,0.336316,0.771672,0.038554
5,0.459969,0.562764,0.467836,0.564175,0.375641
6,0.150224,0.97494,0.576741,0.132581,0.552165
7,0.459969,0.562764,0.467836,0.564175,0.375641
8,0.35616,0.253132,0.389951,0.357658,0.275658
9,0.459969,0.562764,0.467836,0.564175,0.375641


### Dropping the missing data

In [158]:
df_3.dropna()

Unnamed: 0,one,two,three,four,five
0,0.730897,0.933293,0.439829,0.669741,0.595757
2,0.557611,0.188567,0.596341,0.889225,0.416072
4,0.504954,0.463887,0.336316,0.771672,0.038554
6,0.150224,0.97494,0.576741,0.132581,0.552165
8,0.35616,0.253132,0.389951,0.357658,0.275658


### Replacing regular values

In [159]:
df

Unnamed: 0,Name,Age,Height,E-mail
0,Aram,24,178,
1,Anna,54,199,
2,Monty,231,200,monty.python@gmail.com
3,Sarah,21,170,sarah@gmail.com
4,Anna,42,178,


In [162]:
df.replace({'Aram': 'Karen', 'Monty': 'Python', 24: 1000})

Unnamed: 0,Name,Age,Height,E-mail
0,Karen,1000,178,
1,Anna,54,199,
2,Python,231,200,monty.python@gmail.com
3,Sarah,21,170,sarah@gmail.com
4,Anna,42,178,


# Getting unique values in a column

In [163]:
df['Name'].unique()

array(['Aram', 'Anna', 'Monty', 'Sarah'], dtype=object)

In [164]:
len(df['Name'].unique())

4

In [165]:
df['Name'].nunique()

4

## Descriptive statistics

In [191]:
names = ['Tom','James','Ricky','Vin','Steve','Smith','Jack',
        'Lee','David','Gasper','Betina','Andres']

grades = np.random.random(12) * 4
grades = np.around(grades, decimals=2)
ages = np.random.randint(18, 30, 12)
print(grades)

df_4 = pd.DataFrame({'Name': names, 'Age': ages, 'Grade': grades})



[3.8  3.75 1.69 3.67 2.66 2.22 2.3  2.03 1.05 3.5  3.15 0.46]


## mean()
վերադարձնում է միջին արժեքը

In [168]:
df.mean()

Age      24.833333
Grade     2.135000
dtype: float64

In [169]:
df['Age'].mean()

24.833333333333332

In [171]:
df[['Age', 'Grade']].sum()

Age      298.00
Grade     25.62
dtype: float64

In [172]:
df['Grade'].sum()

25.619999999999997

## std()
Վերադարձնում է միջին քառակուսային շեղումը

In [173]:
df.std()

Age      3.880800
Grade    1.160936
dtype: float64

In [174]:
df['Age'].std()

3.88079966767238

## Summarizing Data
**describe()** վերադարձնում է dataframe-ի վիճակագրական տվյալները

In [175]:
df.describe()

Unnamed: 0,Age,Grade
count,12.0,12.0
mean,24.833333,2.135
std,3.8808,1.160936
min,18.0,0.32
25%,21.75,1.5525
50%,24.0,1.985
75%,29.0,2.7975
max,29.0,3.93


In [None]:
df['Age'].median()
df.median()

# Loading data from a file into a dataframe

In [182]:
from google.colab import files
import io

upload = files.upload()

df = pd.read_csv(io.StringIO(upload['water_potability.csv'].decode('utf-8')))
df.head()

KeyError: ignored

In [184]:
df.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


## Renaming rows and columns

In [187]:
s= df.rename(columns={'ph': 'pH'}, index={0: 'a'})

In [188]:
s.head()

Unnamed: 0,pH,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
a,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


## Reindexing

In [190]:
df_1

Unnamed: 0,Height,Age,E-mail
0,178,24,
1,199,54,
2,200,231,monty.python@gmail.com


In [192]:
df_4

Unnamed: 0,Name,Age,Grade
0,Tom,28,3.8
1,James,22,3.75
2,Ricky,18,1.69
3,Vin,18,3.67
4,Steve,27,2.66
5,Smith,27,2.22
6,Jack,20,2.3
7,Lee,19,2.03
8,David,25,1.05
9,Gasper,24,3.5


In [198]:
df_4 = df_4.reindex(index=[2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])

In [202]:
df = df_4.reset_index(drop=True)

In [203]:
df

Unnamed: 0,Name,Age,Grade
0,Ricky,18.0,1.69
1,James,22.0,3.75
2,Vin,18.0,3.67
3,Steve,27.0,2.66
4,Smith,27.0,2.22
5,Jack,20.0,2.3
6,Lee,19.0,2.03
7,David,25.0,1.05
8,Gasper,24.0,3.5
9,Betina,22.0,3.15


## Changing the indexing

In [204]:
df.index = [2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [206]:
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Name,Age,Grade
0,Ricky,18.0,1.69
1,James,22.0,3.75
2,Vin,18.0,3.67
3,Steve,27.0,2.66
4,Smith,27.0,2.22


## Groupby

In [207]:
ipl_data = {'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings',
                     'Kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
         'Rank': [1, 2, 2, 3, 3,4 ,1 ,1,2 , 4,1,2],
         'Year': [2014,2015,2014,2015,2014,2015,2016,2017,2016,2014,2015,2017],
         'Points':[876,789,863,673,741,812,756,788,694,701,804,690]}

cr_df = pd.DataFrame(ipl_data)

In [210]:
gb = cr_df.groupby('Team')

In [211]:
gb.groups

{'Devils': [2, 3], 'Kings': [4, 5, 6, 7], 'Riders': [0, 1, 8, 11], 'Royals': [9, 10]}

In [212]:
means = gb.mean()

In [213]:
means

Unnamed: 0_level_0,Rank,Year,Points
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Devils,2.5,2014.5,768.0
Kings,2.25,2015.5,774.25
Riders,1.75,2015.5,762.25
Royals,2.5,2014.5,752.5


In [216]:
medians = gb.median()
medians

Unnamed: 0_level_0,Rank,Year,Points
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Devils,2.5,2014.5,768.0
Kings,2.0,2015.5,772.0
Riders,2.0,2015.5,741.5
Royals,2.5,2014.5,752.5


### View the groups

In [217]:
gb.groups

{'Devils': [2, 3], 'Kings': [4, 5, 6, 7], 'Riders': [0, 1, 8, 11], 'Royals': [9, 10]}

## Aggregations
**agg()**-ը նշված սյան վրա կատարում է գործողություն (min, max, mean, etc.) և վերադարձնում է արժեք։ 

In [227]:
df_5 = gb.agg({'Rank': 'max', 'Points': 'mean'})

In [228]:
df_5.reset_index(inplace=True)

In [229]:
df_5

Unnamed: 0,Team,Rank,Points
0,Devils,3,768.0
1,Kings,4,774.25
2,Riders,2,762.25
3,Royals,4,752.5


In [230]:
complex_agg = gb.agg({'Rank': ['max', 'min'], 'Points': ['mean', 'std']})

In [231]:
complex_agg

Unnamed: 0_level_0,Rank,Rank,Points,Points
Unnamed: 0_level_1,max,min,mean,std
Team,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Devils,3,2,768.0,134.350288
Kings,4,1,774.25,31.899582
Riders,2,1,762.25,88.567771
Royals,4,1,752.5,72.831998


In [233]:
complex_agg[('Rank', 'min')]

Team
Devils    2
Kings     1
Riders    1
Royals    1
Name: (Rank, min), dtype: int64

## Merging/Joining

In [234]:
df1 = pd.DataFrame({
         'id':[1,2,3,4,5],
         'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
         'subject_id':['sub1','sub2','sub4','sub6','sub5']})

df2 = pd.DataFrame(
         {'id':[1,2,3,4,5],
         'Name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
         'subject_id':['sub2','sub4','sub3','sub5','sub6']})



In [235]:
df1

Unnamed: 0,id,Name,subject_id
0,1,Alex,sub1
1,2,Amy,sub2
2,3,Allen,sub4
3,4,Alice,sub6
4,5,Ayoung,sub5


In [236]:
df2

Unnamed: 0,id,Name,subject_id
0,1,Billy,sub2
1,2,Brian,sub4
2,3,Bran,sub3
3,4,Bryce,sub5
4,5,Betty,sub6


In [240]:
df3 = pd.concat([df1, df2])
df3.reset_index(drop=True, inplace=True)

In [241]:
df3

Unnamed: 0,id,Name,subject_id
0,1,Alex,sub1
1,2,Amy,sub2
2,3,Allen,sub4
3,4,Alice,sub6
4,5,Ayoung,sub5
5,1,Billy,sub2
6,2,Brian,sub4
7,3,Bran,sub3
8,4,Bryce,sub5
9,5,Betty,sub6


In [242]:
df1 + df2

Unnamed: 0,id,Name,subject_id
0,2,AlexBilly,sub1sub2
1,4,AmyBrian,sub2sub4
2,6,AllenBran,sub4sub3
3,8,AliceBryce,sub6sub5
4,10,AyoungBetty,sub5sub6


### Merge Two DataFrames on a Key

In [243]:
pd.merge(df1, df2, on='subject_id')

Unnamed: 0,id_x,Name_x,subject_id,id_y,Name_y
0,2,Amy,sub2,1,Billy
1,3,Allen,sub4,2,Brian
2,4,Alice,sub6,5,Betty
3,5,Ayoung,sub5,4,Bryce


In [244]:
pd.merge(df1, df2, on=['id', 'subject_id'])

Unnamed: 0,id,Name_x,subject_id,Name_y


### Left join

In [256]:
print(df1)
print('=' * 100)
print(df2)

   id    Name subject_id
0   1    Alex       sub1
1   2     Amy       sub2
2   3   Allen       sub4
3   4   Alice       sub6
4   5  Ayoung       sub5
   id   Name subject_id
0   1  Billy       sub2
1   2  Brian       sub4
2   3   Bran       sub3
3   4  Bryce       sub5
4   5  Betty       sub6


In [245]:
pd.merge(df1, df2, on='subject_id', how='left')

Unnamed: 0,id_x,Name_x,subject_id,id_y,Name_y
0,1,Alex,sub1,,
1,2,Amy,sub2,1.0,Billy
2,3,Allen,sub4,2.0,Brian
3,4,Alice,sub6,5.0,Betty
4,5,Ayoung,sub5,4.0,Bryce


### Right join

In [247]:
df_6 = pd.merge(df1, df2, on='subject_id', how='right')


In [255]:
df_6

Unnamed: 0,id_x,Name_x,subject_id,id_y,Name_y
0,2.0,Amy,sub2,1,Billy
1,3.0,Allen,sub4,2,Brian
2,,,sub3,3,Bran
3,5.0,Ayoung,sub5,4,Bryce
4,4.0,Alice,sub6,5,Betty


In [252]:
df_6['id_x'] = pd.Series(df_6['id_x'], dtype=str)

In [254]:
df_6.dtypes

id_x          object
Name_x        object
subject_id    object
id_y           int64
Name_y        object
dtype: object

### Outer join

In [257]:
df_6 = pd.merge(df1, df2, on='subject_id', how='outer')


In [258]:
df_6

Unnamed: 0,id_x,Name_x,subject_id,id_y,Name_y
0,1.0,Alex,sub1,,
1,2.0,Amy,sub2,1.0,Billy
2,3.0,Allen,sub4,2.0,Brian
3,4.0,Alice,sub6,5.0,Betty
4,5.0,Ayoung,sub5,4.0,Bryce
5,,,sub3,3.0,Bran


### Inner join

In [261]:
df_6 = pd.merge(df1, df2, on='id', how='inner')


In [262]:
df_6

Unnamed: 0,id,Name_x,subject_id_x,Name_y,subject_id_y
0,1,Alex,sub1,Billy,sub2
1,2,Amy,sub2,Brian,sub4
2,3,Allen,sub4,Bran,sub3
3,4,Alice,sub6,Bryce,sub5
4,5,Ayoung,sub5,Betty,sub6


Data Visualization: https://matplotlib.org/

https://seaborn.pydata.org/

Kaggle, dataset library: https://www.kaggle.com/