In [1]:
import pandas as pd

In [5]:
# Read both IMDb CSV files (paths are relative to the working directory).
# low_memory=False makes pandas parse the movies file in one pass instead of
# in chunks, so each column's dtype is inferred from the whole file at once.
movies_path = 'IMDb movies.csv'
ratings_path = 'IMDb ratings.csv'
df_movies = pd.read_csv(movies_path, low_memory=False)
df_ratings = pd.read_csv(ratings_path)

# show the ratings table as the cell's output
df_ratings

Unnamed: 0,imdb_title_id,weighted_average_vote,total_votes,mean_vote,median_vote,votes_10,votes_9,votes_8,votes_7,votes_6,...,females_30age_avg_vote,females_30age_votes,females_45age_avg_vote,females_45age_votes,top1000_voters_rating,top1000_voters_votes,us_voters_rating,us_voters_votes,non_us_voters_rating,non_us_voters_votes
0,tt0000009,5.9,154,5.9,6.0,12,4,10,43,28,...,5.7,13.0,4.5,4.0,5.7,34.0,6.4,51.0,6.0,70.0
1,tt0000574,6.1,589,6.3,6.0,57,18,58,137,139,...,6.2,23.0,6.6,14.0,6.4,66.0,6.0,96.0,6.2,331.0
2,tt0001892,5.8,188,6.0,6.0,6,6,17,44,52,...,5.8,4.0,6.8,7.0,5.4,32.0,6.2,31.0,5.9,123.0
3,tt0002101,5.2,446,5.3,5.0,15,8,16,62,98,...,5.5,14.0,6.1,21.0,4.9,57.0,5.5,207.0,4.7,105.0
4,tt0002130,7.0,2237,6.9,7.0,210,225,436,641,344,...,7.3,82.0,7.4,77.0,6.9,139.0,7.0,488.0,7.0,1166.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85850,tt9908390,5.3,398,5.5,6.0,13,9,26,65,104,...,5.7,11.0,5.0,2.0,5.5,12.0,6.3,22.0,5.3,214.0
85851,tt9911196,7.7,724,7.9,8.0,65,139,288,170,42,...,8.0,47.0,7.3,30.0,7.0,6.0,6.8,13.0,7.7,388.0
85852,tt9911774,7.9,265,7.8,8.0,63,29,61,61,31,...,,,,,1.0,1.0,,,2.0,2.0
85853,tt9914286,6.4,194,9.4,10.0,176,0,2,2,1,...,,,7.0,1.0,4.0,3.0,1.7,5.0,5.8,5.0


In [6]:
# narrow each frame down to the columns we want to work with;
# 'imdb_title_id' is kept in both because it identifies the movie in each table
movie_columns = ['imdb_title_id', 'title', 'year', 'genre', 'country']
rating_columns = ['imdb_title_id', 'total_votes', 'mean_vote']
df_movies = df_movies[movie_columns]
df_ratings = df_ratings[rating_columns]

In [8]:
# preview the first three rows of each trimmed frame, shown together as a tuple
movies_preview = df_movies.head(3)
ratings_preview = df_ratings.head(3)
movies_preview, ratings_preview

(  imdb_title_id                        title  year                    genre  \
 0     tt0000009                   Miss Jerry  1894                  Romance   
 1     tt0000574  The Story of the Kelly Gang  1906  Biography, Crime, Drama   
 2     tt0001892               Den sorte drøm  1911                    Drama   
 
             country  
 0               USA  
 1         Australia  
 2  Germany, Denmark  ,
   imdb_title_id  total_votes  mean_vote
 0     tt0000009          154        5.9
 1     tt0000574          589        6.3
 2     tt0001892          188        6.0)

# concat()

## Concatenate vertically

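To concatenate vertically (along the rows), the two DataFrames should have their columns in common: the rows of the second DataFrame are appended below those of the first, and any column present in only one of them is filled with NaN.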

In [10]:
# two small frames with identical columns ('id', 'age'), ready to be stacked row-wise
# (note that the id 'F' appears twice in the second frame)
first_group = {
    'id': ['A', 'B', 'C', 'D'],
    'age': [30, 23, 25, 22],
}
second_group = {
    'id': ['E', 'F', 'G', 'F'],
    'age': [40, 21, 19, 24],
}
df1 = pd.DataFrame(data=first_group)
df2 = pd.DataFrame(data=second_group)

In [14]:
# stack df2 underneath df1; ignore_index=True relabels the rows 0..n-1
# instead of repeating each frame's own 0..3 labels
pd.concat(
    [df1, df2],
    axis='index',
    ignore_index=True,
)

Unnamed: 0,id,age
0,A,30
1,B,23
2,C,25
3,D,22
4,E,40
5,F,21
6,G,19
7,F,24


### Exercise

In [18]:
# extract a 50% sample of the original dataframe;
# random_state pins the random draw so the same rows are selected on every
# Restart & Run All (without it, each run samples a different set of rows)
df_sample = df_movies.sample(frac=0.5, random_state=42)

In [21]:
# (rows, columns) of the sample and of the full frame, before stacking them
(df_sample.shape, df_movies.shape)


((42928, 5), (85855, 5))

In [23]:
# concatenate df_movies and df_sample vertically (along the rows);
# ignore_index=True relabels the result 0..n-1 instead of keeping the original row labels
df_concat_vertically = pd.concat(
    [df_movies, df_sample],
    axis='index',
    ignore_index=True,
)


In [24]:
# (rows, columns) after stacking df_sample under df_movies.
# Left as the cell's last expression (no print) so it renders as the cell
# output, like the other shape checks in this notebook.
df_concat_vertically.shape

(128783, 5)


## Concatenate horizontally

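To concatenate horizontally (along the columns), the two DataFrames should share a common index: rows are matched by index label, and a label present in only one DataFrame produces NaN in the other's columns.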

In [25]:
# two frames with different columns but the same default 0..3 index,
# so they can be placed side by side
people = {
    'id': ['A', 'B', 'C', 'D'],
    'age': [30, 23, 25, 22],
}
jobs = {
    'job': ['Doctor', 'Statistician', 'Accountant', 'Developer'],
}
df1 = pd.DataFrame(data=people)
df2 = pd.DataFrame(data=jobs)

In [26]:
# place df2's columns next to df1's; rows are matched on the shared 0..3 index
pd.concat(
    [df1, df2],
    axis='columns',
)

Unnamed: 0,id,age,job
0,A,30,Doctor
1,B,23,Statistician
2,C,25,Accountant
3,D,22,Developer


### Exercise

In [27]:
# (rows, columns) of the two dataframes that we'll concatenate, one per line
print(df_movies.shape, df_ratings.shape, sep='\n')

(85855, 5)
(85855, 3)


In [31]:
# Use the shared movie key as the index of both frames so that
# pd.concat(..., axis=1) matches rows on 'imdb_title_id'.
# Reassign instead of inplace=True, and skip a frame that is already indexed
# by the key: the original cell raised KeyError when run a second time,
# because the column had already been moved into the index.
if df_movies.index.name != 'imdb_title_id':
    df_movies = df_movies.set_index('imdb_title_id')
if df_ratings.index.name != 'imdb_title_id':
    df_ratings = df_ratings.set_index('imdb_title_id')

In [32]:
# concatenate df_movies and df_ratings horizontally (along the columns);
# both frames are indexed by 'imdb_title_id', so rows are matched on that key
df_concat_horizontally = pd.concat(
    objs=[df_movies, df_ratings],
    axis='columns',
)

In [33]:
# (rows, columns) of the horizontally concatenated frame
df_concat_horizontally.shape

(85855, 6)

In [34]:
# display the concatenated frame, indexed by 'imdb_title_id'
# (pandas abbreviates long frames in the rendered output)
df_concat_horizontally

Unnamed: 0_level_0,title,year,genre,country,total_votes,mean_vote
imdb_title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0000009,Miss Jerry,1894,Romance,USA,154,5.9
tt0000574,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",Australia,589,6.3
tt0001892,Den sorte drøm,1911,Drama,"Germany, Denmark",188,6.0
tt0002101,Cleopatra,1912,"Drama, History",USA,446,5.3
tt0002130,L'Inferno,1911,"Adventure, Drama, Fantasy",Italy,2237,6.9
...,...,...,...,...,...,...
tt9908390,Le lion,2020,Comedy,"France, Belgium",398,5.5
tt9911196,De Beentjes van Sint-Hildegard,2020,"Comedy, Drama",Netherlands,724,7.9
tt9911774,Padmavyuhathile Abhimanyu,2019,Drama,India,265,7.8
tt9914286,Sokagin Çocuklari,2019,"Drama, Family",Turkey,194,9.4
