In [1]:
import numpy as np
import pandas as pd
import os

# Merging and Concatenating Dataframes

In [30]:
# dataset 1: movie metadata, ordered by title id and narrowed to the columns used below

df_movies = (
    pd.read_csv("IMDb movies.csv", low_memory=False)
    .sort_values(by="imdb_title_id")
    .loc[:, ["imdb_title_id", "title", "year", "genre", "country"]]
)
df_movies

Unnamed: 0,imdb_title_id,title,year,genre,country
0,tt0000009,Miss Jerry,1894,Romance,USA
1,tt0000574,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",Australia
2,tt0001892,Den sorte drøm,1911,Drama,"Germany, Denmark"
3,tt0002101,Cleopatra,1912,"Drama, History",USA
4,tt0002130,L'Inferno,1911,"Adventure, Drama, Fantasy",Italy
...,...,...,...,...,...
85850,tt9908390,Le lion,2020,Comedy,"France, Belgium"
85851,tt9911196,De Beentjes van Sint-Hildegard,2020,"Comedy, Drama",Netherlands
85852,tt9911774,Padmavyuhathile Abhimanyu,2019,Drama,India
85853,tt9914286,Sokagin Çocuklari,2019,"Drama, Family",Turkey


In [31]:
# dataset 2: rating statistics, ordered by title id and narrowed to the columns used below

df_ratings = (
    pd.read_csv("IMDb ratings.csv")
    .sort_values(by="imdb_title_id")
    .loc[:, ["imdb_title_id", "total_votes", "mean_vote"]]
)
df_ratings

Unnamed: 0,imdb_title_id,total_votes,mean_vote
0,tt0000009,154,5.9
1,tt0000574,589,6.3
2,tt0001892,188,6.0
3,tt0002101,446,5.3
4,tt0002130,2237,6.9
...,...,...,...
85850,tt9908390,398,5.5
85851,tt9911196,724,7.9
85852,tt9911774,265,7.8
85853,tt9914286,194,9.4


In [32]:
# sample dataset 1: the movie rows whose index label is 0, 2, 4, 6 or 8

movie_sample_mask = df_movies.index.isin([0, 2, 4, 6, 8])
df_movies_sample = df_movies.loc[movie_sample_mask]
df_movies_sample

Unnamed: 0,imdb_title_id,title,year,genre,country
0,tt0000009,Miss Jerry,1894,Romance,USA
2,tt0001892,Den sorte drøm,1911,Drama,"Germany, Denmark"
4,tt0002130,L'Inferno,1911,"Adventure, Drama, Fantasy",Italy
6,tt0002423,Madame DuBarry,1919,"Biography, Drama, Romance",Germany
8,tt0002452,Independenta Romaniei,1912,"History, War",Romania


In [33]:
# sample dataset 2: the rating rows whose index label is 0 through 5

rating_sample_mask = df_ratings.index.isin(list(range(6)))
df_ratings_sample = df_ratings.loc[rating_sample_mask]
df_ratings_sample

Unnamed: 0,imdb_title_id,total_votes,mean_vote
0,tt0000009,154,5.9
1,tt0000574,589,6.3
2,tt0001892,188,6.0
3,tt0002101,446,5.3
4,tt0002130,2237,6.9
5,tt0002199,484,5.8


# 1.  Concatenate - concat()

### 1.1   Concatenate (vertically or along the row, axis=0)

In [5]:
# Toy frame 1: ids A-D with an age and a Gender column
df1 = pd.DataFrame(
    {
        "id": list("ABCD"),
        "age": [30, 23, 25, 22],
        "Gender": list("MFMM"),
    }
)
df1

Unnamed: 0,id,age,Gender
0,A,30,M
1,B,23,F
2,C,25,M
3,D,22,M


In [6]:
# Toy frame 2: shares ids A and B with df1, and carries Score instead of Gender
df2 = pd.DataFrame(
    {
        "id": list("EFGAB"),
        "age": [40, 21, 19, 24, 36],
        "Score": [85, 70, 96, 100, 90],
    }
)
df2

Unnamed: 0,id,age,Score
0,E,40,85
1,F,21,70
2,G,19,96
3,A,24,100
4,B,36,90


In [7]:
pd.concat([df1,df2],axis=0)  # by default axis=0 and outer join

Unnamed: 0,id,age,Gender,Score
0,A,30,M,
1,B,23,F,
2,C,25,M,
3,D,22,M,
0,E,40,,85.0
1,F,21,,70.0
2,G,19,,96.0
3,A,24,,100.0
4,B,36,,90.0


In [40]:
# avoid duplicate indexes

pd.concat([df1,df2],axis=0, ignore_index=True)

Unnamed: 0,id,age,Gender,Score
0,A,30,M,
1,B,23,F,
2,C,25,M,
3,D,22,M,
4,E,40,,85.0
5,F,21,,70.0
6,G,19,,96.0
7,A,24,,100.0
8,B,36,,90.0


In [11]:
pd.concat([df1.set_index("id"),df2.set_index("id")],axis=0)

Unnamed: 0_level_0,age,Gender,Score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,30,M,
B,23,F,
C,25,M,
D,22,M,
E,40,,85.0
F,21,,70.0
G,19,,96.0
A,24,,100.0
B,36,,90.0


In [10]:
pd.concat([df1.set_index("id"),df2.set_index("id")],axis=0,ignore_index=True)

Unnamed: 0,age,Gender,Score
0,30,M,
1,23,F,
2,25,M,
3,22,M,
4,40,,85.0
5,21,,70.0
6,19,,96.0
7,24,,100.0
8,36,,90.0


In [9]:
pd.concat([df1,df2],axis=0,join="inner") # keeps only the columns present in both frames (id, age)

Unnamed: 0,id,age
0,A,30
1,B,23
2,C,25
3,D,22
0,E,40
1,F,21
2,G,19
3,A,24
4,B,36


In [16]:
# Toy frame 3: like df1 but with a repeated id (A appears twice)
df3 = pd.DataFrame(
    {
        "id": list("AABCD"),
        "age": [30, 23, 25, 22, 45],
        "Gender": list("MFMMF"),
    }
)
df3

Unnamed: 0,id,age,Gender
0,A,30,M
1,A,23,F
2,B,25,M
3,C,22,M
4,D,45,F


In [17]:
# Toy frame 4: like df2 but with a repeated id (B appears twice)
df4 = pd.DataFrame(
    {
        "id": list("EFGABB"),
        "age": [40, 21, 19, 24, 36, 52],
        "Score": [85, 70, 96, 100, 90, 75],
    }
)
df4

Unnamed: 0,id,age,Score
0,E,40,85
1,F,21,70
2,G,19,96
3,A,24,100
4,B,36,90
5,B,52,75


In [52]:
pd.concat([df3,df4], axis=0)

Unnamed: 0,id,age,Gender,Score
0,A,30,M,
1,A,23,F,
2,B,25,M,
3,C,22,M,
4,D,45,F,
0,E,40,,85.0
1,F,21,,70.0
2,G,19,,96.0
3,A,24,,100.0
4,B,36,,90.0
5,B,52,,75.0


In [54]:
pd.concat([df3.set_index("id"),df4.set_index("id")], axis=0)

Unnamed: 0_level_0,age,Gender,Score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,30,M,
A,23,F,
B,25,M,
C,22,M,
D,45,F,
E,40,,85.0
F,21,,70.0
G,19,,96.0
A,24,,100.0
B,36,,90.0
B,52,,75.0


In [31]:
# concatenate vertically df_movies_sample and df_ratings_sample

pd.concat([df_movies_sample,df_ratings_sample],axis=0,ignore_index=True)

Unnamed: 0,imdb_title_id,title,year,genre,country,total_votes,mean_vote
0,tt0000009,Miss Jerry,1894.0,Romance,USA,,
1,tt0001892,Den sorte drøm,1911.0,Drama,"Germany, Denmark",,
2,tt0002130,L'Inferno,1911.0,"Adventure, Drama, Fantasy",Italy,,
3,tt0002423,Madame DuBarry,1919.0,"Biography, Drama, Romance",Germany,,
4,tt0002452,Independenta Romaniei,1912.0,"History, War",Romania,,
5,tt0000009,,,,,154.0,5.9
6,tt0000574,,,,,589.0,6.3
7,tt0001892,,,,,188.0,6.0
8,tt0002101,,,,,446.0,5.3
9,tt0002130,,,,,2237.0,6.9
10,tt0002199,,,,,484.0,5.8


In [32]:
# setting common index and concatenate vertically df_movies_sample and df_ratings_sample

pd.concat([df_movies_sample.set_index("imdb_title_id"),df_ratings_sample.set_index("imdb_title_id")],axis=0)

Unnamed: 0_level_0,title,year,genre,country,total_votes,mean_vote
imdb_title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0000009,Miss Jerry,1894.0,Romance,USA,,
tt0001892,Den sorte drøm,1911.0,Drama,"Germany, Denmark",,
tt0002130,L'Inferno,1911.0,"Adventure, Drama, Fantasy",Italy,,
tt0002423,Madame DuBarry,1919.0,"Biography, Drama, Romance",Germany,,
tt0002452,Independenta Romaniei,1912.0,"History, War",Romania,,
tt0000009,,,,,154.0,5.9
tt0000574,,,,,589.0,6.3
tt0001892,,,,,188.0,6.0
tt0002101,,,,,446.0,5.3
tt0002130,,,,,2237.0,6.9
tt0002199,,,,,484.0,5.8


### 1.2   Concatenate (horizontally or along the column, axis=1)

In [55]:
pd.concat([df1,df2],axis=1)

Unnamed: 0,id,age,Gender,id.1,age.1,Score
0,A,30.0,M,E,40,85
1,B,23.0,F,F,21,70
2,C,25.0,M,G,19,96
3,D,22.0,M,A,24,100
4,,,,B,36,90


Indexes (important!)

df1 index → [0, 1, 2, 3]

df2 index → [0, 1, 2, 3, 4]

###### Key rules applied:

- Concatenate columns side-by-side

- Align rows by index, NOT by id

- Uses outer join on index (default)

- Missing values → NaN

###### Conclusion:

- The id column is not used to match rows

- Pandas only matches rows by index label (here the default labels 0, 1, 2, ...), not by the values in any column

- So "A" in df1 is not matched with "A" in df2

In [56]:
pd.concat([df1,df2],axis=1, join="inner")
# pd.concat([df1,df2],axis=1, join="outer") 'outer' by default

Unnamed: 0,id,age,Gender,id.1,age.1,Score
0,A,30,M,E,40,85
1,B,23,F,F,21,70
2,C,25,M,G,19,96
3,D,22,M,A,24,100


In [57]:
# indexing the dataframes and then concating

pd.concat([df1.set_index("id"),df2.set_index("id")],axis=1)

Unnamed: 0_level_0,age,Gender,age,Score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,30.0,M,24.0,100.0
B,23.0,F,36.0,90.0
C,25.0,M,,
D,22.0,M,,
E,,,40.0,85.0
F,,,21.0,70.0
G,,,19.0,96.0


In [60]:
pd.concat([df1.set_index("id"),df2.set_index("id")],axis=1, join="inner")

Unnamed: 0_level_0,age,Gender,age,Score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,30,M,24,100
B,23,F,36,90


In [73]:
pd.concat([df3,df4],axis=1)

Unnamed: 0,id,age,Gender,id.1,age.1,Score
0,A,30.0,M,E,40,85
1,A,23.0,F,F,21,70
2,B,25.0,M,G,19,96
3,C,22.0,M,A,24,100
4,D,45.0,F,B,36,90
5,,,,B,52,75


In [28]:
# pd.concat([df3.set_index("id"),df4.set_index("id")],axis=1)

# shows error as there is duplicate index: the "id" index repeats labels in both frames
# (A twice in df3, B twice in df4), so pandas cannot align the rows for axis=1
# (recent pandas versions raise InvalidIndexError - TODO confirm on the installed version)

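#### What pandas does:

- Concatenates columns

- Aligns rows by index (id)

- Uses outer join on index

- Does NOT create combinations for duplicate keys - because the index labels are not unique, the alignment itself fails and an error is raised instead of a result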

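#### Why this happens:

- concat(axis=1) with duplicate indexes:

- Pandas aligns row-by-row on index labels, not many-to-many

- With repeated labels (A in df3, B in df4) it cannot decide which row should be matched with which

- So nothing is matched or filled with NaN - pandas raises an error (in recent versions: InvalidIndexError, "Reindexing only valid with uniquely valued Index objects")

- concat cannot do relational joins with duplicates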

#### What you probably expected:

- If you expected all combinations for A and B (many-to-many join), you must use merge(), e.g. `df3.merge(df4, on="id", how="outer")`

#### Rule to remember:

- If indexes are unique → concat(axis=1) is OK

- If duplicates exist → use merge()

In [34]:
# concatenate horizontally df_movies_sample and df_ratings_sample

pd.concat([df_movies_sample,df_ratings_sample],axis=1)

Unnamed: 0,imdb_title_id,title,year,genre,country,imdb_title_id.1,total_votes,mean_vote
0,tt0000009,Miss Jerry,1894.0,Romance,USA,tt0000009,154.0,5.9
2,tt0001892,Den sorte drøm,1911.0,Drama,"Germany, Denmark",tt0001892,188.0,6.0
4,tt0002130,L'Inferno,1911.0,"Adventure, Drama, Fantasy",Italy,tt0002130,2237.0,6.9
6,tt0002423,Madame DuBarry,1919.0,"Biography, Drama, Romance",Germany,,,
8,tt0002452,Independenta Romaniei,1912.0,"History, War",Romania,,,
1,,,,,,tt0000574,589.0,6.3
3,,,,,,tt0002101,446.0,5.3
5,,,,,,tt0002199,484.0,5.8


In [76]:
# set "imdb_title_id" as the index of both samples, then concatenate horizontally,
# so that rows are aligned on the title id instead of the default integer labels

pd.concat([df_movies_sample.set_index("imdb_title_id"),df_ratings_sample.set_index("imdb_title_id")],axis=1)

Unnamed: 0_level_0,title,year,genre,country,total_votes,mean_vote
imdb_title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0000009,Miss Jerry,1894.0,Romance,USA,154.0,5.9
tt0001892,Den sorte drøm,1911.0,Drama,"Germany, Denmark",188.0,6.0
tt0002130,L'Inferno,1911.0,"Adventure, Drama, Fantasy",Italy,2237.0,6.9
tt0002423,Madame DuBarry,1919.0,"Biography, Drama, Romance",Germany,,
tt0002452,Independenta Romaniei,1912.0,"History, War",Romania,,
tt0000574,,,,,589.0,6.3
tt0002101,,,,,446.0,5.3
tt0002199,,,,,484.0,5.8


# 2)  Joins

## 2.1)  Inner Join

Return common matching data of two dataframes - intersection of two dataframes

In [100]:
# Left-hand frame for the join examples: ids A-D with an age
df1 = pd.DataFrame(
    {
        "id": list("ABCD"),
        "age": [30, 23, 25, 22],
    }
)
df1

Unnamed: 0,id,age
0,A,30
1,B,23
2,C,25
3,D,22


In [101]:
# Right-hand frame for the join examples: ids C-F with a job and a salary
job_rows = [
    ("C", "Doctor", 30000),
    ("D", "Statistician", 20000),
    ("E", "Accountant", 10000),
    ("F", "Developer", 40000),
]
df2 = pd.DataFrame(job_rows, columns=["id", "job", "salary"])
df2

Unnamed: 0,id,job,salary
0,C,Doctor,30000
1,D,Statistician,20000
2,E,Accountant,10000
3,F,Developer,40000


In [105]:
df1.merge(df2,on="id",how="inner")  # default value of 'how' is 'inner'

Unnamed: 0,id,age,job,salary
0,C,25,Doctor,30000
1,D,22,Statistician,20000


In [103]:
df2.merge(df1,on="id",how="inner")

Unnamed: 0,id,job,salary,age
0,C,Doctor,30000,25
1,D,Statistician,20000,22


In [35]:
# Exercise

# merge df_movies_sample and df_ratings_sample based on "imdb_title_id"

df_movies_sample.merge(df_ratings_sample,on="imdb_title_id",how="inner")

# pd.merge(df_movies_sample,df_ratings_sample, on="imdb_title_id",how="inner") - another way

Unnamed: 0,imdb_title_id,title,year,genre,country,total_votes,mean_vote
0,tt0000009,Miss Jerry,1894,Romance,USA,154,5.9
1,tt0001892,Den sorte drøm,1911,Drama,"Germany, Denmark",188,6.0
2,tt0002130,L'Inferno,1911,"Adventure, Drama, Fantasy",Italy,2237,6.9


## 2.2) Outer join (Full join)

Returns common and full data of both dataframes - union of two dataframes

In [83]:
df1=pd.DataFrame({"id":["A","B","C","D"],"age":[30,23,25,22]})
df1

Unnamed: 0,id,age
0,A,30
1,B,23
2,C,25
3,D,22


In [84]:
df2=pd.DataFrame({"id":["C","D","E","F"],"job":["Doctor","Statistician","Accountant","Developer"],"salary":[30000,20000,10000,40000]})
df2

Unnamed: 0,id,job,salary
0,C,Doctor,30000
1,D,Statistician,20000
2,E,Accountant,10000
3,F,Developer,40000


In [86]:
df1.merge(df2,on="id",how="outer")

Unnamed: 0,id,age,job,salary
0,A,30.0,,
1,B,23.0,,
2,C,25.0,Doctor,30000.0
3,D,22.0,Statistician,20000.0
4,E,,Accountant,10000.0
5,F,,Developer,40000.0


In [87]:
df2.merge(df1,on="id",how="outer")

Unnamed: 0,id,job,salary,age
0,A,,,30.0
1,B,,,23.0
2,C,Doctor,30000.0,25.0
3,D,Statistician,20000.0,22.0
4,E,Accountant,10000.0,
5,F,Developer,40000.0,


In [36]:
# Exercise

# merge df_movies_sample and df_ratings_sample based on "imdb_title_id"

df_movies_sample.merge(df_ratings_sample,on="imdb_title_id",how="outer")

# pd.merge(df_movies_sample,df_ratings_sample, on="imdb_title_id",how="outer")   # another way

Unnamed: 0,imdb_title_id,title,year,genre,country,total_votes,mean_vote
0,tt0000009,Miss Jerry,1894.0,Romance,USA,154.0,5.9
1,tt0000574,,,,,589.0,6.3
2,tt0001892,Den sorte drøm,1911.0,Drama,"Germany, Denmark",188.0,6.0
3,tt0002101,,,,,446.0,5.3
4,tt0002130,L'Inferno,1911.0,"Adventure, Drama, Fantasy",Italy,2237.0,6.9
5,tt0002199,,,,,484.0,5.8
6,tt0002423,Madame DuBarry,1919.0,"Biography, Drama, Romance",Germany,,
7,tt0002452,Independenta Romaniei,1912.0,"History, War",Romania,,


### 2.2.1) Exclusive full join

returns all data excluding the common part

In [37]:
df_movies_sample.merge(df_ratings_sample,on="imdb_title_id",how="outer",indicator=True)  # a new column _merge is formed

Unnamed: 0,imdb_title_id,title,year,genre,country,total_votes,mean_vote,_merge
0,tt0000009,Miss Jerry,1894.0,Romance,USA,154.0,5.9,both
1,tt0000574,,,,,589.0,6.3,right_only
2,tt0001892,Den sorte drøm,1911.0,Drama,"Germany, Denmark",188.0,6.0,both
3,tt0002101,,,,,446.0,5.3,right_only
4,tt0002130,L'Inferno,1911.0,"Adventure, Drama, Fantasy",Italy,2237.0,6.9,both
5,tt0002199,,,,,484.0,5.8,right_only
6,tt0002423,Madame DuBarry,1919.0,"Biography, Drama, Romance",Germany,,,left_only
7,tt0002452,Independenta Romaniei,1912.0,"History, War",Romania,,,left_only


In [38]:
df_movies_sample.merge(df_ratings_sample,on="imdb_title_id",how="outer",indicator=True).query("_merge != 'both'")

# df_movies_sample.merge(df_ratings_sample,on="imdb_title_id",how="outer",indicator=True).query("_merge == 'left_only' or _merge == 'right_only'")

Unnamed: 0,imdb_title_id,title,year,genre,country,total_votes,mean_vote,_merge
1,tt0000574,,,,,589.0,6.3,right_only
3,tt0002101,,,,,446.0,5.3,right_only
5,tt0002199,,,,,484.0,5.8,right_only
6,tt0002423,Madame DuBarry,1919.0,"Biography, Drama, Romance",Germany,,,left_only
7,tt0002452,Independenta Romaniei,1912.0,"History, War",Romania,,,left_only


## 2.3) Left join

Return common data and data of the first dataframe

In [88]:
df1=pd.DataFrame({"id":["A","B","C","D"],"age":[30,23,25,22]})
df1

Unnamed: 0,id,age
0,A,30
1,B,23
2,C,25
3,D,22


In [90]:
df2=pd.DataFrame({"id":["C","D","E","F"],"job":["Doctor","Statistician","Accountant","Developer"],"salary":[30000,20000,10000,40000]})
df2

Unnamed: 0,id,job,salary
0,C,Doctor,30000
1,D,Statistician,20000
2,E,Accountant,10000
3,F,Developer,40000


In [91]:
df1.merge(df2,on="id",how="left")

Unnamed: 0,id,age,job,salary
0,A,30,,
1,B,23,,
2,C,25,Doctor,30000.0
3,D,22,Statistician,20000.0


In [92]:
df2.merge(df1,on="id",how="left")

Unnamed: 0,id,job,salary,age
0,C,Doctor,30000,25.0
1,D,Statistician,20000,22.0
2,E,Accountant,10000,
3,F,Developer,40000,


In [39]:
# Exercise

# merge df_movies_sample and df_ratings_sample based on "imdb_title_id"

df_movies_sample.merge(df_ratings_sample,on="imdb_title_id",how="left")

# pd.merge(df_movies_sample,df_ratings_sample, on="imdb_title_id",how="left")   # another way

Unnamed: 0,imdb_title_id,title,year,genre,country,total_votes,mean_vote
0,tt0000009,Miss Jerry,1894,Romance,USA,154.0,5.9
1,tt0001892,Den sorte drøm,1911,Drama,"Germany, Denmark",188.0,6.0
2,tt0002130,L'Inferno,1911,"Adventure, Drama, Fantasy",Italy,2237.0,6.9
3,tt0002423,Madame DuBarry,1919,"Biography, Drama, Romance",Germany,,
4,tt0002452,Independenta Romaniei,1912,"History, War",Romania,,


In [40]:
df_ratings_sample.merge(df_movies_sample,on="imdb_title_id",how="left")

Unnamed: 0,imdb_title_id,total_votes,mean_vote,title,year,genre,country
0,tt0000009,154,5.9,Miss Jerry,1894.0,Romance,USA
1,tt0000574,589,6.3,,,,
2,tt0001892,188,6.0,Den sorte drøm,1911.0,Drama,"Germany, Denmark"
3,tt0002101,446,5.3,,,,
4,tt0002130,2237,6.9,L'Inferno,1911.0,"Adventure, Drama, Fantasy",Italy
5,tt0002199,484,5.8,,,,


### 2.3.1)  Exclusive left join

returns only the rows of the first dataframe that have no match in the second

In [41]:
df_ratings_sample.merge(df_movies_sample,on="imdb_title_id",how="left",indicator=True)

Unnamed: 0,imdb_title_id,total_votes,mean_vote,title,year,genre,country,_merge
0,tt0000009,154,5.9,Miss Jerry,1894.0,Romance,USA,both
1,tt0000574,589,6.3,,,,,left_only
2,tt0001892,188,6.0,Den sorte drøm,1911.0,Drama,"Germany, Denmark",both
3,tt0002101,446,5.3,,,,,left_only
4,tt0002130,2237,6.9,L'Inferno,1911.0,"Adventure, Drama, Fantasy",Italy,both
5,tt0002199,484,5.8,,,,,left_only


In [42]:
df_ratings_sample.merge(df_movies_sample,on="imdb_title_id",how="left",indicator=True).query("_merge == 'left_only'")

Unnamed: 0,imdb_title_id,total_votes,mean_vote,title,year,genre,country,_merge
1,tt0000574,589,6.3,,,,,left_only
3,tt0002101,446,5.3,,,,,left_only
5,tt0002199,484,5.8,,,,,left_only


## 2.4) Right join

Return common data and data of the second dataframe

In [93]:
df1=pd.DataFrame({"id":["A","B","C","D"],"age":[30,23,25,22]})
df1

Unnamed: 0,id,age
0,A,30
1,B,23
2,C,25
3,D,22


In [94]:
df2=pd.DataFrame({"id":["C","D","E","F"],"job":["Doctor","Statistician","Accountant","Developer"],"salary":[30000,20000,10000,40000]})
df2

Unnamed: 0,id,job,salary
0,C,Doctor,30000
1,D,Statistician,20000
2,E,Accountant,10000
3,F,Developer,40000


In [95]:
df1.merge(df2,on="id",how="right")

Unnamed: 0,id,age,job,salary
0,C,25.0,Doctor,30000
1,D,22.0,Statistician,20000
2,E,,Accountant,10000
3,F,,Developer,40000


In [97]:
df2.merge(df1,on="id",how="right")

Unnamed: 0,id,job,salary,age
0,A,,,30
1,B,,,23
2,C,Doctor,30000.0,25
3,D,Statistician,20000.0,22


In [43]:
# Exercise

# merge df_movies_sample and df_ratings_sample based on "imdb_title_id"

df_movies_sample.merge(df_ratings_sample,on="imdb_title_id",how="right")

# pd.merge(df_movies_sample,df_ratings_sample, on="imdb_title_id",how="right")   # another way

Unnamed: 0,imdb_title_id,title,year,genre,country,total_votes,mean_vote
0,tt0000009,Miss Jerry,1894.0,Romance,USA,154,5.9
1,tt0000574,,,,,589,6.3
2,tt0001892,Den sorte drøm,1911.0,Drama,"Germany, Denmark",188,6.0
3,tt0002101,,,,,446,5.3
4,tt0002130,L'Inferno,1911.0,"Adventure, Drama, Fantasy",Italy,2237,6.9
5,tt0002199,,,,,484,5.8


In [44]:
df_ratings_sample.merge(df_movies_sample,on="imdb_title_id",how="right")

Unnamed: 0,imdb_title_id,total_votes,mean_vote,title,year,genre,country
0,tt0000009,154.0,5.9,Miss Jerry,1894,Romance,USA
1,tt0001892,188.0,6.0,Den sorte drøm,1911,Drama,"Germany, Denmark"
2,tt0002130,2237.0,6.9,L'Inferno,1911,"Adventure, Drama, Fantasy",Italy
3,tt0002423,,,Madame DuBarry,1919,"Biography, Drama, Romance",Germany
4,tt0002452,,,Independenta Romaniei,1912,"History, War",Romania


### 2.4.1) Exclusive right join

returns only the rows of the second dataframe that have no match in the first


In [45]:
df_movies_sample.merge(df_ratings_sample,on="imdb_title_id",how="right",indicator=True)

Unnamed: 0,imdb_title_id,title,year,genre,country,total_votes,mean_vote,_merge
0,tt0000009,Miss Jerry,1894.0,Romance,USA,154,5.9,both
1,tt0000574,,,,,589,6.3,right_only
2,tt0001892,Den sorte drøm,1911.0,Drama,"Germany, Denmark",188,6.0,both
3,tt0002101,,,,,446,5.3,right_only
4,tt0002130,L'Inferno,1911.0,"Adventure, Drama, Fantasy",Italy,2237,6.9,both
5,tt0002199,,,,,484,5.8,right_only


In [46]:
# Exclusive right join: keep only the rows whose key is in df_ratings_sample but not in df_movies_sample.
# Direct form (right join, then filter on the indicator column):
# df_movies_sample.merge(df_ratings_sample,on="imdb_title_id",how="right",indicator=True).query("_merge == 'right_only'")

# Same filter applied to an outer join; the 'right_only' rows are the keys found only in the right frame
df_movies_sample.merge(df_ratings_sample,on="imdb_title_id",how="outer",indicator=True).query("_merge == 'right_only'") # can also use this

Unnamed: 0,imdb_title_id,title,year,genre,country,total_votes,mean_vote,_merge
1,tt0000574,,,,,589.0,6.3,right_only
3,tt0002101,,,,,446.0,5.3,right_only
5,tt0002199,,,,,484.0,5.8,right_only


In [47]:
# Exercise - make a copy of the dataframe and set the first 5 values of 'imdb_title_id' as 'tt1234567890'

# Work on a copy so that df_movies itself is left untouched.
df_movies1 = df_movies.copy()

# Overwrite the first five rows by position in a single vectorised assignment.
# A Python loop over every index label testing `label < 5` is slow on a large frame and
# only hits "the first 5 values" when the labels happen to be 0..4 at the top of the frame.
id_col_pos = df_movies1.columns.get_loc("imdb_title_id")
df_movies1.iloc[:5, id_col_pos] = "tt1234567890"
df_movies1

Unnamed: 0,imdb_title_id,title,year,genre,country
0,tt1234567890,Miss Jerry,1894,Romance,USA
1,tt1234567890,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",Australia
2,tt1234567890,Den sorte drøm,1911,Drama,"Germany, Denmark"
3,tt1234567890,Cleopatra,1912,"Drama, History",USA
4,tt1234567890,L'Inferno,1911,"Adventure, Drama, Fantasy",Italy
...,...,...,...,...,...
85850,tt9908390,Le lion,2020,Comedy,"France, Belgium"
85851,tt9911196,De Beentjes van Sint-Hildegard,2020,"Comedy, Drama",Netherlands
85852,tt9911774,Padmavyuhathile Abhimanyu,2019,Drama,India
85853,tt9914286,Sokagin Çocuklari,2019,"Drama, Family",Turkey


#### Merging dataframes on different column names

In [98]:
# Left-hand frame whose key column is called "id1"
df1 = pd.DataFrame(
    {
        "id1": list("ABCD"),
        "age": [30, 23, 25, 22],
    }
)
df1

Unnamed: 0,id1,age
0,A,30
1,B,23
2,C,25
3,D,22


In [99]:
# Right-hand frame whose key column is called "id2"
df2 = pd.DataFrame(
    {
        "id2": list("CDEF"),
        "job": ["Doctor", "Statistician", "Accountant", "Developer"],
        "salary": [30000, 20000, 10000, 40000],
    }
)
df2

Unnamed: 0,id2,job,salary
0,C,Doctor,30000
1,D,Statistician,20000
2,E,Accountant,10000
3,F,Developer,40000


In [100]:
df1.merge(df2, left_on= "id1", right_on = "id2", how = "inner")

# the same can be done for outer/left/right/exclusive joins

Unnamed: 0,id1,age,id2,job,salary
0,C,25,C,Doctor,30000
1,D,22,D,Statistician,20000


In [101]:
pd.merge(df1, df2, left_on= "id1", right_on = "id2", how = "inner")

# the same can be done for outer/left/right/exclusive joins

Unnamed: 0,id1,age,id2,job,salary
0,C,25,C,Doctor,30000
1,D,22,D,Statistician,20000


## join() method

In [110]:
# Frame indexed by "id1" (labels A-D) with a single age column
df5 = pd.DataFrame(
    {"age": [30, 23, 25, 22]},
    index=pd.Index(list("ABCD"), name="id1"),
)
df5

Unnamed: 0_level_0,age
id1,Unnamed: 1_level_1
A,30
B,23
C,25
D,22


In [111]:
# Frame indexed by "id2" (labels C-F) with job and salary columns
df6 = pd.DataFrame(
    {
        "job": ["Doctor", "Statistician", "Accountant", "Developer"],
        "salary": [30000, 20000, 10000, 40000],
    },
    index=pd.Index(list("CDEF"), name="id2"),
)
df6

Unnamed: 0_level_0,job,salary
id2,Unnamed: 1_level_1,Unnamed: 2_level_1
C,Doctor,30000
D,Statistician,20000
E,Accountant,10000
F,Developer,40000


In [112]:
df5.join(df6, how = "inner")

Unnamed: 0,age,job,salary
C,25,Doctor,30000
D,22,Statistician,20000


In [75]:
# error will occur, as this syntax is not possible: join() exists only as a DataFrame method
# (as in df5.join(df6, how = "inner")); pandas has no top-level pd.join function

# pd.join(df1, df2, how = "inner")