### Importing dependencies, reading the .csv, checking the shape, columns, data types and unique values

In [1]:
# Import dependencies
import pandas as pd
import numpy as np
from pathlib import Path
import json

In [2]:
# Load the raw movies data set from the working directory
movies_csv = Path("movies.csv")
movies_df = pd.read_csv(movies_csv)

# Preview the first rows of the raw frame
movies_df.head()

Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,revenue,runtime,status,tagline,vote_average,vote_count,credits,keywords,poster_path,backdrop_path,recommendations
0,76600,Avatar: The Way of Water,Science Fiction-Adventure-Action,en,Set more than a decade after the events of the...,9366.788,20th Century Studios-Lightstorm Entertainment,2022-12-14,350000000.0,2312336000.0,192.0,Released,Return to Pandora.,7.751,6748.0,Sam Worthington-Zoe Saldaña-Sigourney Weaver-S...,loss of loved one-dying and death-alien life-f...,/t6HIqrRAclMCA60NsSmeqe9RmNV.jpg,/s16H6tpK2utvwDtzZ8Qy4qm5Emw.jpg,183392-111332-702432-505642-1064215-436270-874...
1,758323,The Pope's Exorcist,Horror-Mystery-Thriller,en,Father Gabriele Amorth Chief Exorcist of the V...,5953.227,Screen Gems-2.0 Entertainment-Jesus & Mary-Wor...,2023-04-05,18000000.0,65675820.0,103.0,Released,Inspired by the actual files of Father Gabriel...,7.433,545.0,Russell Crowe-Daniel Zovatto-Alex Essoe-Franco...,spain-rome italy-vatican-pope-pig-possession-c...,/9JBEPLTPSm0d1mbEcLxULjJq9Eh.jpg,/hiHGRbyTcbZoLsYYkO4QiCLYe34.jpg,713704-296271-502356-1076605-1084225-1008005-9...
2,594767,Shazam! Fury of the Gods,Action-Comedy-Fantasy,en,Billy Batson and his foster siblings who trans...,5759.074,New Line Cinema-The Safran Company-DC Films-Wa...,2023-03-15,125000000.0,133004300.0,130.0,Released,Oh. My. Gods.,6.84,1355.0,Zachary Levi-Asher Angel-Jack Dylan Grazer-Ada...,superhero-end of the world-super power-aftercr...,/2VK4d3mqqTc7LVZLnLPeRiPaJ71.jpg,/wybmSmviUXxlBmX44gtpow5Y9TB.jpg,700391-994751-948713-640146-502356-938992-7660...
3,502356,The Super Mario Bros. Movie,Animation-Adventure-Family-Fantasy-Comedy,en,While working underground to fix a water main ...,5132.098,Universal Pictures-Illumination-Nintendo,2023-04-05,100000000.0,58000000.0,92.0,Released,,7.556,332.0,Chris Pratt-Anya Taylor-Joy-Charlie Day-Jack B...,video game-plumber-magic mushroom-based on vid...,/qNBAXBIQlnOThrVvA6mA2B5ggV6.jpg,/iw0Na1UBHgA5BgifwmQ8vKhlWgA.jpg,
4,640146,Ant-Man and the Wasp: Quantumania,Action-Adventure-Science Fiction,en,Super-Hero partners Scott Lang and Hope van Dy...,4704.903,Marvel Studios-Kevin Feige Productions,2023-02-15,200000000.0,473237900.0,125.0,Released,Witness the beginning of a new dynasty.,6.448,1547.0,Paul Rudd-Evangeline Lilly-Jonathan Majors-Kat...,hero-ant-sequel-superhero-based on comic-famil...,/ngl2FKBlU4fhbdsrtdom9LVLBXw.jpg,/3CxUndGhUcZdt1Zggjdb2HkLLQX.jpg,965839-734048-267805-1035806-823999-842942-772...


In [3]:
# Checking the shape
movies_df.shape

(722894, 20)

In [4]:
# Checking the columns
movies_df.columns

Index(['id', 'title', 'genres', 'original_language', 'overview', 'popularity',
       'production_companies', 'release_date', 'budget', 'revenue', 'runtime',
       'status', 'tagline', 'vote_average', 'vote_count', 'credits',
       'keywords', 'poster_path', 'backdrop_path', 'recommendations'],
      dtype='object')

In [5]:
# Checking data types
movies_df.dtypes

id                        int64
title                    object
genres                   object
original_language        object
overview                 object
popularity              float64
production_companies     object
release_date             object
budget                  float64
revenue                 float64
runtime                 float64
status                   object
tagline                  object
vote_average            float64
vote_count              float64
credits                  object
keywords                 object
poster_path              object
backdrop_path            object
recommendations          object
dtype: object

In [6]:
# Checking the unique values
movies_df.nunique()

id                      662083
title                   575362
genres                   11026
original_language          167
overview                541538
popularity               20065
production_companies    146898
release_date             41208
budget                    4255
revenue                  12913
runtime                    635
status                       6
tagline                  95426
vote_average              3148
vote_count                3490
credits                 422809
keywords                119228
poster_path             482423
backdrop_path           196341
recommendations          30700
dtype: int64

### Removing duplicates, NaN values and not needed columns

In [7]:
# Listing the rows that are exact copies of an earlier row
exact_copy_mask = movies_df.duplicated()
movies_df.loc[exact_copy_mask]

# The output shows a single fully duplicated row

Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,revenue,runtime,status,tagline,vote_average,vote_count,credits,keywords,poster_path,backdrop_path,recommendations
199030,802661,Hellsing Ultimate,Horror-Action-Animation,ja,A live-action adaptation of the popular eponym...,1.146,Amazon Studios-Automatik Entertainment-Ranger ...,2006-02-10,0.0,0.0,0.0,Released,,8.0,1.0,,,,,


In [8]:
# Dropping the fully duplicated rows; drop_duplicates() keeps the first occurrence of each
movies_df = movies_df.drop_duplicates()

In [9]:
# Shape after removing the full duplicate (one row fewer than the raw frame).
# A bare last expression is displayed by the notebook, like the other shape checks.
movies_df.shape

(722893, 20)


In [10]:
# Counting the missing (NaN) values in each column
movies_df.isna().sum()

id                           0
title                        4
genres                  210942
original_language            0
overview                118640
popularity                   0
production_companies    385727
release_date             52599
budget                       0
revenue                      0
runtime                  34460
status                       0
tagline                 614818
vote_average                 0
vote_count                   0
credits                 225151
keywords                512656
poster_path             185353
backdrop_path           500446
recommendations         688196
dtype: int64

In [11]:
# Inspecting the rows whose 'recommendations' value is missing
missing_recommendations = movies_df['recommendations'].isna()
movies_df.loc[missing_recommendations]

Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,revenue,runtime,status,tagline,vote_average,vote_count,credits,keywords,poster_path,backdrop_path,recommendations
3,502356,The Super Mario Bros. Movie,Animation-Adventure-Family-Fantasy-Comedy,en,While working underground to fix a water main ...,5132.098,Universal Pictures-Illumination-Nintendo,2023-04-05,100000000.0,58000000.0,92.0,Released,,7.556,332.0,Chris Pratt-Anya Taylor-Joy-Charlie Day-Jack B...,video game-plumber-magic mushroom-based on vid...,/qNBAXBIQlnOThrVvA6mA2B5ggV6.jpg,/iw0Na1UBHgA5BgifwmQ8vKhlWgA.jpg,
11,956101,The Eighth Clause,Thriller,la,Kat and Borja appear to be a perfect couple bu...,2259.303,SDB Films-El Hombre Orquesta,2022-04-29,0.0,0.0,0.0,Released,,4.600,10.0,Maite Perroni-Manuel Vega-Óscar Jaenada-Jessic...,,/8tc8eMFAX2SDC1TRu987qFQy8Cl.jpg,/kLnqNE9Af5QHyvUxw8cDGhF1ilv.jpg,
28,511617,Berlin Drifters,Drama,ja,Kôichi is a Japanese man living alone in Berli...,918.173,Jürgen Brüning Filmproduktion-Habakari Cinema,2017-09-10,0.0,0.0,123.0,Released,,5.000,9.0,Lyota Majima-Kôichi Imaizumi-Mioo Satô-Michael...,gay erotica,/aH0yHU915NibzznpB2Zks2c6Z9O.jpg,/8oBSFGFaHp2MtTMeuWZ2ugmx28m.jpg,
31,892070,Padre no hay más que uno 3,Comedy-Family,es,Christmas is coming. The children accidentally...,749.721,Atresmedia-Bowfinger,2022-07-15,0.0,0.0,0.0,Released,,8.500,2.0,Santiago Segura-Toni Acosta-Martina D’Antiochi...,,/rmFURMXlphmrGr8gItXkuHqg43G.jpg,/znUdSyO9ZUopUfmr6DH5YT5D5Cs.jpg,
32,770509,The Nights Belong to Monsters,Fantasy-Drama,es,Sol a 17-year-old teenager moves with her moth...,742.199,Rispo Films-Tieless Media,2021-10-09,0.0,0.0,96.0,Released,,2.000,1.0,Lu Grasso-Esteban Lamothe-Jazmín Stuart-Gustav...,,/cYNfI1WHxMm0JhCXEX9qUUOv8C3.jpg,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
722889,680322,Mata Hari,,de,,0.600,,,0.0,0.0,0.0,Released,,0.000,0.0,Asta Nielsen,,,,
722890,547172,Yosemite — A Gathering of Spirit,Documentary,en,Burns’ Yosemite: A Gathering of Spirit a docum...,0.600,,2013-12-08,0.0,0.0,25.0,Released,,0.000,0.0,,,/9W83Mv9zfONDJ89Fxpm6hH6v7Pa.jpg,/dVxWVpuBdLuhCVgWKxXOdakcBrx.jpg,
722891,518756,The Eternal Life of Alexander Christoforov,Comedy-Adventure,ru,The misanthrope Alexander Christoforov nicknam...,0.600,58.5 Production,2018-10-18,0.0,0.0,104.0,Released,,6.300,3.0,Aleksey Guskov-Timofey Tribuntsev-Oksana Fande...,,/34AOD9KOtKSUiCLi88iDhoOBuHm.jpg,/4UTxLuedDDxFILt3Hmb4K6e5qay.jpg,
722892,600197,Amazônia Indomável,Documentary,pt,,0.600,,1952-01-01,0.0,0.0,0.0,Released,,0.000,0.0,Gaspar Coelho,indigenous-anthropology-ethnography,,,


In [12]:
# Dropping the 'recommendations' column (disabled: the drop below is commented out, so the column is kept)
# movies_df = movies_df.drop(columns='recommendations')

In [13]:
# Shape check: still 20 columns, because the column drop in the previous cell is commented out
movies_df.shape

(722893, 20)

In [14]:
# Removing the columns that are not needed for the cleaned data set
unneeded_columns = ['backdrop_path', 'tagline', 'budget', 'revenue']
movies_df = movies_df.drop(columns=unneeded_columns)

In [15]:
# Keeping only the rows without any missing value
# (dropna() defaults to how='any': a single NaN in any column removes the row).
# NOTE: 'recommendations' is null in 688,196 rows, so it accounts for most of the rows removed here.
movies_df = movies_df.dropna()

In [16]:
# The data set decreased to 23433 records

movies_df.shape

(23433, 16)

In [17]:
# Data types of the 16 remaining columns
movies_df.dtypes

id                        int64
title                    object
genres                   object
original_language        object
overview                 object
popularity              float64
production_companies     object
release_date             object
runtime                 float64
status                   object
vote_average            float64
vote_count              float64
credits                  object
keywords                 object
poster_path              object
recommendations          object
dtype: object

In [18]:
# Checking the unique values again
movies_df.nunique()


id                      22400
title                   21446
genres                   3573
original_language          78
overview                22390
popularity              15220
production_companies    17572
release_date            11211
runtime                   261
status                      1
vote_average             2335
vote_count               3475
credits                 22245
keywords                20482
poster_path             22400
recommendations         21757
dtype: int64

We have more rows than unique id values. We have to check for more duplicates.

In [19]:
# Rows whose id already appeared earlier in the frame
repeated_id_mask = movies_df['id'].duplicated()
movies_df.loc[repeated_id_mask]

Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,runtime,status,vote_average,vote_count,credits,keywords,poster_path,recommendations
143,785521,Battle: Freestyle,Romance-Drama,no,Amalie and Mikael lead their street dance team...,238.540,Friland Produksjon AS,2022-04-01,88.0,Released,5.1,30.0,Lisa Teige-Fabian Svegaard Tapia-Ellen Dorrit ...,paris france-based on novel or book-norway-dan...,/6D6QumiHEhnpZG12Ibjy2BxA6n4.jpg,818750-790525-800407-739993-946726-682344-9532...
168,785521,Battle: Freestyle,Romance-Drama,no,Amalie and Mikael lead their street dance team...,210.928,Friland Produksjon AS,2022-04-01,88.0,Released,5.1,30.0,Lisa Teige-Fabian Svegaard Tapia-Ellen Dorrit ...,paris france-based on novel or book-norway-dan...,/6D6QumiHEhnpZG12Ibjy2BxA6n4.jpg,818750-790525-800407-739993-946726-682344-9532...
783,871875,Escape the Undertaker,Horror-Mystery-Family,en,The Undertaker has set a trap for the decorate...,72.909,WWE Studios,2021-10-05,31.0,Released,5.2,48.0,Mark Calaway-Kofi Sarkodie-Mensah-Austin Watso...,interactive,/dSF7INctoCQ4iPFRiRkZjIQIIVa.jpg,567748-550988-602223-639721-512025-610253-3609...
993,698320,The Mad Hatter,Horror-Thriller,en,An eccentric professor takes four of his stude...,61.070,Conglomerate Media-Swen Studios,2021-01-29,90.0,Released,5.8,47.0,Armando Gutiérrez-Nick Miller-Samuel Caleb Wal...,fairy tale-爱丽丝仙境-惊悚,/lTUfalNirpq7SBASa6lnzs27iam.jpg,514847-591274-482373-751394
1302,480157,House of the Witch,Horror-TV Movie,en,A group of high-school kids set out to play a ...,51.607,Distilled Media,2017-10-07,90.0,Released,4.8,115.0,Emily Bader-Michelle Randolph-Jules Hartley-Ar...,witch-halloween-haunted house-prank,/cg6xjZOnGsYYqC7SH3o8oJV9Vr1.jpg,371608-429467-301846-375012-365995-340601-5326...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
563172,447965,Guru,Drama-Action,te,A grumpy boxing coach takes on a young rebelli...,0.600,Y NOT Studios,2017-03-31,116.0,Released,7.1,8.0,Venkatesh-Ritika Singh-Mumtaz Sorcar-Nassar-Ta...,boxing trainer-woman director-boxing-women box...,/beYqbxGlwSK1A8ww7j4NR2uiRHN.jpg,53122
688563,496418,Sarah Kohr: Mord im Alten Land,Crime-TV Movie,de,An emergency call has Commissioner Sarah Kohr ...,0.600,Die Film GmbH,2018-04-23,88.0,Released,5.2,5.0,Lisa Maria Potthoff-Marcus Mittermeier-Herbert...,police,/t3a8fjuFMTEchLHKzpx5x7Zu0kO.jpg,701058-791283-602138-256551-1266-884972-14161
696755,438595,Bricks in Motion,Documentary,en,BRICKS IN MOTION is a feature length documenta...,0.600,Ergo Possum Productions-One Brick Studios-Mind...,2016-01-01,87.0,Released,6.7,3.0,Chris Boyer-Rachel Dew-Maxime Marion-James Mor...,stop motion-filmmaking-lego-hobbies,/dkyqH1FmxGihAZTr27DkYlR97P0.jpg,401113-586502-925753-926138-855647-926548-7833...
713494,524503,Petitet,Documentary,ca,Joan Ximénez el Petitet is a Catalan gypsy who...,0.600,Grifols-Lastor Media-Movistar+-SomAtents-TV3-T...,2018-06-08,99.0,Released,5.6,7.0,Petitet-Carles Benavent-Christina Scheppelmann...,barcelona spain-gypsy music-catalonia-money pr...,/waRqK463a8o9a6fls53IXRfnIv9.jpg,453191


In [20]:
# Keep one row per movie id (the first occurrence).
# De-duplicating on 'id' alone matches the duplicate check in the previous cell and
# guarantees unique ids even if two rows share an id but differ in title.
# Reassigning instead of inplace=True keeps the cell free of in-place mutation.
movies_df = movies_df.drop_duplicates(subset='id', keep='first')


In [21]:
# Row count after removing the rows with repeated ids
movies_df.shape

(22400, 16)

In [22]:
# Unique values per column; the 'id' count should now equal the number of rows
movies_df.nunique()

id                      22400
title                   21446
genres                   3573
original_language          78
overview                22390
popularity              14884
production_companies    17572
release_date            11211
runtime                   261
status                      1
vote_average             2335
vote_count               3475
credits                 22240
keywords                20482
poster_path             22400
recommendations         21749
dtype: int64

##### Checking the status of the movies

In [23]:
# Distribution of the 'status' values; the bare expression uses the notebook's display instead of print()
movies_df['status'].value_counts()

Released    22400
Name: status, dtype: int64


In [24]:
# Keeping only the movies whose 'status' is "Released"
is_released = movies_df['status'] == 'Released'
movies_df = movies_df.loc[is_released]


In [25]:
# Sanity check: exactly one distinct status value should remain
movies_df['status'].nunique()

1

In [26]:
# Removing the 'status' column
movies_df = movies_df.drop('status', axis='columns')

In [27]:
# Replace the gappy index left by the row drops with a fresh 0..n-1 RangeIndex.
# 'id' is already the first column, so this gives the same frame as the
# set_index('id').reset_index() round trip without moving the column twice.
movies_df = movies_df.reset_index(drop=True)


In [28]:
# Preview the first 50 rows after the index reset
movies_df.head(50)

Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,runtime,vote_average,vote_count,credits,keywords,poster_path,recommendations
0,76600,Avatar: The Way of Water,Science Fiction-Adventure-Action,en,Set more than a decade after the events of the...,9366.788,20th Century Studios-Lightstorm Entertainment,2022-12-14,192.0,7.751,6748.0,Sam Worthington-Zoe Saldaña-Sigourney Weaver-S...,loss of loved one-dying and death-alien life-f...,/t6HIqrRAclMCA60NsSmeqe9RmNV.jpg,183392-111332-702432-505642-1064215-436270-874...
1,758323,The Pope's Exorcist,Horror-Mystery-Thriller,en,Father Gabriele Amorth Chief Exorcist of the V...,5953.227,Screen Gems-2.0 Entertainment-Jesus & Mary-Wor...,2023-04-05,103.0,7.433,545.0,Russell Crowe-Daniel Zovatto-Alex Essoe-Franco...,spain-rome italy-vatican-pope-pig-possession-c...,/9JBEPLTPSm0d1mbEcLxULjJq9Eh.jpg,713704-296271-502356-1076605-1084225-1008005-9...
2,594767,Shazam! Fury of the Gods,Action-Comedy-Fantasy,en,Billy Batson and his foster siblings who trans...,5759.074,New Line Cinema-The Safran Company-DC Films-Wa...,2023-03-15,130.0,6.84,1355.0,Zachary Levi-Asher Angel-Jack Dylan Grazer-Ada...,superhero-end of the world-super power-aftercr...,/2VK4d3mqqTc7LVZLnLPeRiPaJ71.jpg,700391-994751-948713-640146-502356-938992-7660...
3,640146,Ant-Man and the Wasp: Quantumania,Action-Adventure-Science Fiction,en,Super-Hero partners Scott Lang and Hope van Dy...,4704.903,Marvel Studios-Kevin Feige Productions,2023-02-15,125.0,6.448,1547.0,Paul Rudd-Evangeline Lilly-Jonathan Majors-Kat...,hero-ant-sequel-superhero-based on comic-famil...,/ngl2FKBlU4fhbdsrtdom9LVLBXw.jpg,965839-734048-267805-1035806-823999-842942-772...
4,677179,Creed III,Drama-Action,en,After dominating the boxing world Adonis Creed...,3994.342,Metro-Goldwyn-Mayer-Proximity Media-Balboa Pro...,2023-03-01,116.0,7.262,1129.0,Michael B. Jordan-Tessa Thompson-Jonathan Majo...,philadelphia pennsylvania-husband wife relatio...,/cvsXj3I9Q2iyyIo95AecSd1tad7.jpg,965839-267805-943822-842942-1035806-823999-107...
5,631842,Knock at the Cabin,Horror-Mystery-Thriller,en,While vacationing at a remote cabin a young gi...,3422.537,Blinding Edge Pictures-Universal Pictures-Film...,2023-02-01,100.0,6.457,888.0,Dave Bautista-Jonathan Groff-Ben Aldridge-Kris...,based on novel or book-sacrifice-cabin-faith-e...,/dm06L9pxDOL9jNSK4Cb6y139rrG.jpg,1058949-646389-772515-505642-143970-667216-104...
6,447365,Guardians of the Galaxy Volume 3,Science Fiction-Adventure-Action,en,Peter Quill still reeling from the loss of Gam...,2740.512,Marvel Studios-Kevin Feige Productions,2023-05-03,150.0,8.3,568.0,Chris Pratt-Zoe Saldaña-Bradley Cooper-Dave Ba...,hero-sequel-superhero-based on comic-superhero...,/r2J02Z2OpNTctfOSN1Ydgii51I3.jpg,420808-868759-948713-640146-1084244-455476-603...
7,646389,Plane,Action-Adventure-Thriller,en,After a heroic job of successfully landing his...,2618.646,MadRiver Pictures-Di Bonaventura Pictures-G-BA...,2023-01-12,107.0,6.901,785.0,Gerard Butler-Mike Colter-Yoson An-Tony Goldwy...,pilot-airplane-philippines-held hostage-plane ...,/qi9r5xBgcc9KTxlOLjssEbDgO0J.jpg,505642-758769-864692-631842-1058949-925943-758...
8,505642,Black Panther: Wakanda Forever,Action-Adventure-Science Fiction,en,Queen Ramonda Shuri M’Baku Okoye and the Dora ...,2525.408,Marvel Studios,2022-11-09,162.0,7.338,3922.0,Letitia Wright-Lupita Nyong'o-Danai Gurira-Win...,loss of loved one-hero-sequel-superhero-based ...,/sv1xJUazXeYqALzczSZ3O6nkH75.jpg,436270-829280-76600-56969-312634-1037858-238-5...
9,934433,Scream VI,Horror-Mystery-Thriller,en,Following the latest Ghostface killings the fo...,2472.802,Radio Silence-Project X Entertainment-Spyglass...,2023-03-08,123.0,7.4,1007.0,Melissa Barrera-Jenna Ortega-Jasmin Savoy Brow...,new york city-mask-trauma-halloween-college-fa...,/wDWwtvkRRlgTiUr6TyLSMX8FCuZ.jpg,646385-804150-677179-631842-943822-824742-9762...


In [29]:
# Data types before converting 'release_date'
movies_df.dtypes

id                        int64
title                    object
genres                   object
original_language        object
overview                 object
popularity              float64
production_companies     object
release_date             object
runtime                 float64
vote_average            float64
vote_count              float64
credits                  object
keywords                 object
poster_path              object
recommendations          object
dtype: object

In [30]:
# Parse the 'release_date' strings into pandas datetimes
parsed_release_dates = pd.to_datetime(movies_df['release_date'])
movies_df['release_date'] = parsed_release_dates

In [31]:
# Confirm that 'release_date' is now a datetime column
movies_df.dtypes

id                               int64
title                           object
genres                          object
original_language               object
overview                        object
popularity                     float64
production_companies            object
release_date            datetime64[ns]
runtime                        float64
vote_average                   float64
vote_count                     float64
credits                         object
keywords                        object
poster_path                     object
recommendations                 object
dtype: object

### Saving to csv.

In [33]:
# Write the cleaned frame to disk without the positional index
cleaned_csv_name = 'movies_data_cleaned.csv'
movies_df.to_csv(cleaned_csv_name, index=False)