# Data Cleaning - Tidy up messy Datasets (The Movie Database)

In [1]:
import pandas as pd

movies_raw = pd.read_csv('movies_raw.csv', low_memory=False)
movies_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  180 non-null    bool   
 1   backdrop_path          178 non-null    object 
 2   belongs_to_collection  108 non-null    object 
 3   budget                 180 non-null    int64  
 4   genres                 180 non-null    object 
 5   homepage               143 non-null    object 
 6   id                     180 non-null    int64  
 7   imdb_id                179 non-null    object 
 8   origin_country         180 non-null    object 
 9   original_language      180 non-null    object 
 10  original_title         180 non-null    object 
 11  overview               180 non-null    object 
 12  popularity             180 non-null    float64
 13  poster_path            180 non-null    object 
 14  production_companies   180 non-null    object 
 15  produc

## Dropping irrelevant Columns

2. __Drop__ the irrelevant columns 'adult', 'imdb_id', 'original_title', 'video' and 'homepage'.

In [2]:
movies_raw.drop(columns = ['adult', 'imdb_id', 'original_title', 'video', 'homepage', 'backdrop_path'], inplace=True)
movies_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   belongs_to_collection  108 non-null    object 
 1   budget                 180 non-null    int64  
 2   genres                 180 non-null    object 
 3   id                     180 non-null    int64  
 4   origin_country         180 non-null    object 
 5   original_language      180 non-null    object 
 6   overview               180 non-null    object 
 7   popularity             180 non-null    float64
 8   poster_path            180 non-null    object 
 9   production_companies   180 non-null    object 
 10  production_countries   180 non-null    object 
 11  release_date           180 non-null    object 
 12  revenue                180 non-null    int64  
 13  runtime                180 non-null    int64  
 14  spoken_languages       180 non-null    object 
 15  status

In [3]:
movies_raw.head(2)

Unnamed: 0,belongs_to_collection,budget,genres,id,origin_country,original_language,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,,0,"[{'id': 10749, 'name': 'Romance'}]",259872,['NL'],nl,Real-life young couple Wim and Floor spend an ...,394.968,/uCkANtG6ezb7hjRKVudY3PUcbvn.jpg,"[{'id': 10422, 'logo_path': None, 'name': 'Blu...","[{'iso_3166_1': 'NL', 'name': 'Netherlands'}]",2010-02-19,0,60,"[{'english_name': 'No Language', 'iso_639_1': ...",Released,,Skin. Like. Sun.,6.5,31
1,"{'id': 420, 'name': 'The Chronicles of Narnia ...",180000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",411,"['GB', 'US']",en,"Siblings Lucy, Edmund, Susan and Peter step th...",359.436,/iREd0rNCjYdf5Ar0vfaW32yrkm.jpg,"[{'id': 2, 'logo_path': '/wdrCwmRnLFJhEoH8GSfy...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",2005-12-07,745013115,143,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The beloved masterpiece comes to life.,"The Chronicles of Narnia: The Lion, the Witch ...",7.134,10727


## Handling stringified JSON columns

In [4]:
movies_raw.belongs_to_collection[1]

"{'id': 420, 'name': 'The Chronicles of Narnia Collection', 'poster_path': '/sh6Kn8VBfXotJ6qsvJkdfscxXKR.jpg', 'backdrop_path': '/ojjzZUQlqKTsN1T7s5OAVZSjYMH.jpg'}"

In [5]:
movies_raw.genres[1]

"[{'id': 12, 'name': 'Adventure'}, {'id': 10751, 'name': 'Family'}, {'id': 14, 'name': 'Fantasy'}]"

In [6]:
movies_raw.origin_country[1]

"['GB', 'US']"

In [7]:
movies_raw.production_companies[1]

"[{'id': 2, 'logo_path': '/wdrCwmRnLFJhEoH8GSfymY85KHT.png', 'name': 'Walt Disney Pictures', 'origin_country': 'US'}, {'id': 10221, 'logo_path': '/99VfWRgKasZoyK9UVB39gnYvFrZ.png', 'name': 'Walden Media', 'origin_country': 'US'}, {'id': 79503, 'logo_path': None, 'name': 'C.S. Lewis Company', 'origin_country': 'GB'}]"

In [8]:
movies_raw.production_countries[1]

"[{'iso_3166_1': 'GB', 'name': 'United Kingdom'}, {'iso_3166_1': 'US', 'name': 'United States of America'}]"

In [9]:
movies_raw.spoken_languages[1]

"[{'english_name': 'English', 'iso_639_1': 'en', 'name': 'English'}, {'english_name': 'German', 'iso_639_1': 'de', 'name': 'Deutsch'}]"

In [10]:
import json
import ast
import numpy as np

In [11]:
json.loads(movies_raw.genres[0].replace("'", '"'))

[{'id': 10749, 'name': 'Romance'}]

In [12]:
ast.literal_eval(movies_raw.genres[1])

[{'id': 12, 'name': 'Adventure'},
 {'id': 10751, 'name': 'Family'},
 {'id': 14, 'name': 'Fantasy'}]

In [13]:
'|'.join(i['name'] for i in ast.literal_eval(movies_raw.genres[1]))

'Adventure|Family|Fantasy'

## Managing flatten nested Columns

In [14]:
movies_complete = movies_raw.copy()

In [15]:
def collection(x):
    if isinstance(x, str) == True:
       x = ast.literal_eval(x)
       return x['name']

    else:
       return np.nan

movies_complete['belongs_to_collection'] = movies_complete.belongs_to_collection.apply(collection)

In [16]:
movies_complete['belongs_to_collection'].value_counts().to_frame()

Unnamed: 0_level_0,count
belongs_to_collection,Unnamed: 1_level_1
Harry Potter Collection,8
Pirates of the Caribbean Collection,4
Shrek Collection,4
The Lord of the Rings Collection,3
The Twilight Collection,3
...,...
Zootopia Collection,1
The Hunger Games Collection,1
Iron Man Collection,1
Mulan Collection,1


In [17]:
movies_complete['genres'] = movies_complete.genres.apply(lambda x: '|'.join(i['name'] for i in ast.literal_eval(x)))

In [18]:
movies_complete['genres'].value_counts().to_frame()

Unnamed: 0_level_0,count
genres,Unnamed: 1_level_1
Adventure|Fantasy|Action,7
Adventure|Fantasy,7
Drama|Romance,6
Action|Adventure|Science Fiction,6
Drama,5
...,...
Science Fiction|Drama,1
Animation|Family|Comedy|Fantasy|Adventure|Romance,1
Family|Animation|Music|Adventure,1
Animation|Family|Adventure|Fantasy,1


In [19]:
movies_complete.origin_country = movies_complete.origin_country.apply(lambda x:'|'.join(i for i in ast.literal_eval(x)))

In [20]:
movies_complete.origin_country.to_frame()

Unnamed: 0,origin_country
0,NL
1,GB|US
2,US
3,US
4,FR
...,...
175,US
176,US
177,US
178,US


In [21]:
def companies(x):
    for i in x:
        if isinstance(x, str) == True:
           x = '|'.join(i['name'] for i in ast.literal_eval(x))
           return x

        else:
           return np.nan

movies_complete.production_companies = movies_complete.production_companies.apply(companies)

In [22]:
movies_complete.production_companies.value_counts().to_frame()

Unnamed: 0_level_0,count
production_companies,Unnamed: 1_level_1
Pixar,5
Pixar|Walt Disney Pictures,4
Warner Bros. Pictures|Heyday Films,4
DreamWorks Animation,4
New Line Cinema|WingNut Films|The Saul Zaentz Company,3
...,...
Alcon Entertainment|Columbia Pictures|Scott Free Productions|Bud Yorkin Productions|Torridon Films|16:14 Entertainment|Thunderbird Entertainment,1
Lionsgate,1
Summit Entertainment|Temple Hill Entertainment|Maverick Films|Imprint Entertainment|Goldcrest|Aura Films,1
Walt Disney Pictures|Jerry Bruckheimer Films,1


In [23]:
movies_complete.production_countries = movies_complete.production_countries.apply(lambda x: '|'.join(i['name'] for i in ast.literal_eval(x)))
movies_complete.production_countries.value_counts().to_frame()

Unnamed: 0_level_0,count
production_countries,Unnamed: 1_level_1
United States of America,102
United Kingdom|United States of America,26
Japan,8
New Zealand|United States of America,5
Italy,4
South Korea,4
France|United States of America,3
Canada,3
Germany|United States of America,2
United States of America|United Kingdom,2


In [24]:
movies_complete.spoken_languages = movies_complete.spoken_languages.apply(lambda x: '|'.join(i['name'] for i in ast.literal_eval(x)))
movies_complete.spoken_languages.value_counts().to_frame()

Unnamed: 0_level_0,count
spoken_languages,Unnamed: 1_level_1
English,104
English|Español,10
日本語,8
English|Français,6
Italiano,4
한국어/조선말,3
English|Deutsch,3
English|普通话,2
Français|English,2
No Language,2


In [25]:
movies_complete.head()

Unnamed: 0,belongs_to_collection,budget,genres,id,origin_country,original_language,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,,0,Romance,259872,NL,nl,Real-life young couple Wim and Floor spend an ...,394.968,/uCkANtG6ezb7hjRKVudY3PUcbvn.jpg,Blue Artichoke Films,Netherlands,2010-02-19,0,60,No Language,Released,,Skin. Like. Sun.,6.5,31
1,The Chronicles of Narnia Collection,180000000,Adventure|Family|Fantasy,411,GB|US,en,"Siblings Lucy, Edmund, Susan and Peter step th...",359.436,/iREd0rNCjYdf5Ar0vfaW32yrkm.jpg,Walt Disney Pictures|Walden Media|C.S. Lewis C...,United Kingdom|United States of America,2005-12-07,745013115,143,English|Deutsch,Released,The beloved masterpiece comes to life.,"The Chronicles of Narnia: The Lion, the Witch ...",7.134,10727
2,,165000000,Adventure|Drama|Science Fiction,157336,US,en,The adventures of a group of explorers who mak...,338.901,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Legendary Pictures|Syncopy|Lynda Obst Productions,United Kingdom|United States of America,2014-11-05,701729206,169,English,Released,Mankind was born on Earth. It was never meant ...,Interstellar,8.448,36242
3,xXx Collection,70000000,Action|Adventure|Thriller|Crime,7451,US,en,Xander Cage is your standard adrenaline junkie...,304.922,/xeEw3eLeSFmJgXZzmF2Efww0q3s.jpg,Columbia Pictures|Original Film|Revolution Stu...,United States of America|Czech Republic,2002-08-09,277448382,124,Český|English|Deutsch|Español|Pусский,Released,A new breed of secret agent.,xXx,5.937,4323
4,,0,Romance|Drama|Fantasy,11845,FR,en,Failed musician Jay abandoned his family and n...,244.899,/I0mlv1FahaLJqu2ByffMXEn4NN.jpg,ARTE France Cinéma|Téléma Productions|StudioCa...,France|Germany|United Kingdom,2001-01-20,2672527,119,English,Released,Every Wednesday. She meets him once per week.,Intimacy,6.3,509


## Cleaning Numerical Columns

In [26]:
print(movies_complete.budget.value_counts().head(1))
print(movies_complete.id.value_counts().head(1))
print(movies_complete.popularity.value_counts().head(1))
print(movies_complete.revenue.value_counts().head(1))
print(movies_complete.runtime.value_counts().head(1))

budget
0    22
Name: count, dtype: int64
id
259872    1
Name: count, dtype: int64
popularity
394.968    1
Name: count, dtype: int64
revenue
0    19
Name: count, dtype: int64
runtime
92    7
Name: count, dtype: int64


__Dealing with irregular values in budget and revenue. Converting in MUSD.__

In [27]:
movies_complete.budget = movies_complete.budget.replace(0, np.nan)
movies_complete.budget.value_counts(dropna=False).head()

budget
NaN            22
150000000.0    10
40000000.0      6
250000000.0     6
175000000.0     5
Name: count, dtype: int64

In [28]:
movies_complete.revenue = movies_complete.revenue.replace(0, np.nan)
movies_complete.revenue.value_counts(dropna=False).head()

revenue
NaN             19
1.429401e+08     1
7.828373e+08     1
1.067316e+09     1
6.602333e+07     1
Name: count, dtype: int64

In [29]:
movies_complete.revenue = movies_complete.revenue.div(1000000)
movies_complete.budget = movies_complete.budget.div(1000000)

__Searching for irregular values in vote_count and vote_avergae.__

In [30]:
movies_complete[['vote_count','vote_average']].describe()

Unnamed: 0,vote_count,vote_average
count,180.0,180.0
mean,13079.116667,7.337128
std,8349.834936,0.881935
min,6.0,4.0
25%,7109.0,6.81775
50%,13089.5,7.5395
75%,18551.5,7.93775
max,36912.0,8.709


## Cleaning DateTime Columns

In [31]:
movies_complete.release_date = pd.to_datetime(movies_complete.release_date, errors='coerce')
movies_complete.release_date

0     2010-02-19
1     2005-12-07
2     2014-11-05
3     2002-08-09
4     2001-01-20
         ...    
175   2008-06-04
176   2003-10-10
177   2011-05-25
178   2002-12-16
179   2016-02-19
Name: release_date, Length: 180, dtype: datetime64[ns]

## Text / String Columns

In [32]:
movies_complete.overview.value_counts(dropna=False).to_frame()

Unnamed: 0_level_0,count
overview,Unnamed: 1_level_1
"Real-life young couple Wim and Floor spend an afternoon in the sunwashed rooms of a crumbling home in Belgium. In a unique twist, this artistic erotic documentary is edited in nearly real-time. In the slowness, we get the build, the sweetness, and the sexiness. Forget about fingersnapping fast editing. Slow is where it’s at.",1
"A small town girl is caught between dead-end jobs. A high-profile, successful man becomes wheelchair bound following an accident. The man decides his life is not worth living until the girl is hired for six months to be his new caretaker. Worlds apart and trapped together by circumstance, the two get off to a rocky start. But the girl becomes determined to prove to the man that life is worth living and as they embark on a series of adventures together, each finds their world changing in ways neither of them could begin to imagine.",1
"Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's crimes to protect the late attorney's reputation and is subsequently hunted by the Gotham City Police Department. Eight years later, Batman encounters the mysterious Selina Kyle and the villainous Bane, a new terrorist leader who overwhelms Gotham's finest. The Dark Knight resurfaces to protect a city that has branded him an enemy.",1
"When self-centered Emperor Kuzco is turned into a llama by his scheming advisor, he is forced to rely on good-hearted peasant Pacha to get back home.",1
"The origin story of former Special Forces operative turned mercenary Wade Wilson, who, after being subjected to a rogue experiment that leaves him with accelerated healing powers, adopts the alter ego Deadpool. Armed with his new abilities and a dark, twisted sense of humor, Deadpool hunts down the man who nearly destroyed his life.",1
...,...
"Young princess Anna of Arendelle dreams about finding true love at her sister Elsa’s coronation. Fate takes her on a dangerous journey in an attempt to end the eternal winter that has fallen over the kingdom. She's accompanied by ice delivery man Kristoff, his reindeer Sven, and snowman Olaf. On an adventure where she will find out what friendship, courage, family, and true love really means.",1
"As Lord Voldemort tightens his grip on both the Muggle and wizarding worlds, Hogwarts is no longer a safe haven. Harry suspects perils may even lie within the castle, but Dumbledore is more intent upon preparing him for the final battle fast approaching. Together they work to find the key to unlock Voldemorts defenses and to this end, Dumbledore recruits his old friend and colleague Horace Slughorn, whom he believes holds crucial information. Even as the decisive showdown looms, romance blossoms for Harry, Ron, Hermione and their classmates. Love is in the air, but danger lies ahead and Hogwarts may never be the same again.",1
"Despite his family’s baffling generations-old ban on music, Miguel dreams of becoming an accomplished musician like his idol, Ernesto de la Cruz. Desperate to prove his talent, Miguel finds himself in the stunning and colorful Land of the Dead following a mysterious chain of events. Along the way, he meets charming trickster Hector, and together, they set off on an extraordinary journey to unlock the real story behind Miguel's family history.",1
"Shrek, Fiona, and Donkey set off to Far, Far Away to meet Fiona's mother and father, the Queen and King. But not everyone is happily ever after. Shrek and the King find it difficult to get along, and there's tension in the marriage. The Fairy Godmother discovers that Fiona has married Shrek instead of her son Prince Charming and plots to destroy their marriage.",1


In [33]:
movies_complete.tagline.value_counts(dropna=False).to_frame()

Unnamed: 0_level_0,count
tagline,Unnamed: 1_level_1
,18
Sooner or later everyone does.,1
No toy gets left behind.,1
The world is yours.,1
Boys will be boys. . . some longer than others.,1
...,...
The celebration of a lifetime.,1
Once upon another time...,1
The key to the future is finally unearthed.,1
The rebellion begins.,1


## Handling Missing Values & Removing Observations

In [34]:
movies_complete.id.value_counts()

id
259872    1
296096    1
49026     1
11688     1
293660    1
         ..
109445    1
767       1
354912    1
809       1
310131    1
Name: count, Length: 180, dtype: int64

In [35]:
movies_complete.title.value_counts()

title
Cinderella                                2
Skin. Like. Sun.                          1
John Wick: Chapter 2                      1
The Dark Knight Rises                     1
The Emperor's New Groove                  1
                                         ..
Frozen                                    1
Harry Potter and the Half-Blood Prince    1
Coco                                      1
Shrek 2                                   1
The Witch                                 1
Name: count, Length: 179, dtype: int64

In [38]:
movies_complete[movies_complete.isna().sum(axis=1) > 3]

Unnamed: 0,belongs_to_collection,budget,genres,id,origin_country,original_language,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,,,Romance,259872,NL,nl,Real-life young couple Wim and Floor spend an ...,394.968,/uCkANtG6ezb7hjRKVudY3PUcbvn.jpg,Blue Artichoke Films,Netherlands,2010-02-19,,60,No Language,Released,,Skin. Like. Sun.,6.5,31
18,,,Animation|Comedy|Action,324568,JP,ja,Does being the only guy in an all girl school ...,163.327,/rWEykM1thLhYee6mK25fGLXyO14.jpg,Studio Signal Club,Japan,1992-08-21,,43,日本語,Released,,Delinquent in Drag,4.0,6
33,,,Mystery|Drama|Romance|Thriller,153104,CA|US,en,Holden returns home from college and is surpri...,145.307,/A6A15cDR8mpdOxW4y7UzFanzbM0.jpg,Hearst Entertainment Productions|JB Media|Ince...,Canada,2003-03-01,,93,English,Released,,Wicked Minds,6.6,37
48,,,Comedy,151586,IT,it,"The summer vacation of a young beautiful girl,...",131.329,/cpGdO3aGbDDxJICQ2vD5bUfb3qO.jpg,Lido,Italy,1974-06-14,,92,Italiano,Released,,La nipote,5.6,14
50,,,Animation|Science Fiction|Comedy,68828,JP,ja,In the midst of a violent invasion of Earth by...,128.098,/tKiIq3RgFOGMiNTj53wa2M68L7L.jpg,Tatsunoko Production|Anime International Company,Japan,1986-12-16,,51,日本語,Released,,Outlanders,4.9,14
56,,,Animation|Documentary,426889,CA,fr,"Women are lucky, they get to have the only org...",124.901,/p5i1yyQW6OpLfB59w7ZkX5lvBqD.jpg,Mel Hoppenheim School of Cinema,Canada,2016-06-18,,3,Français,Released,,Le clitoris,6.1,11
75,,,Mystery|Thriller|Romance,460229,US,en,A reckless night of indiscretion and lust lead...,111.773,/5QdNwga65k5iCE7fsghSdBmyVld.jpg,Retromedia Entertainment,United States of America,2017-08-08,,81,English,Released,,Kiss and Kill,5.474,57
90,,,Drama|Romance,46848,FR,fr,Three stories. A solitary sailor falls from hi...,107.065,/qHm5qK7bNyK0EgwE2qHps38yNQc.jpg,Les Films du Jeudi|FFF-French Movies|Toei Cent...,France|Japan,1979-06-27,,103,English|Français|日本語,Released,,Private Collections,6.05,20
93,,,Drama|Horror|Fantasy|Comedy,197158,BR,pt,Three tales of the erotic: Two young ladies ex...,106.436,/bhbPSpYjsRJAK7rXqL8APK9EwBl.jpg,Dacar Produções Cinematográficas,Brazil,1981-01-01,,83,Português,Released,,Porno,4.98,75
96,,,Thriller,108450,IT,it,"Mark, a hitman haunted by his past as a corrup...",105.256,/5nXdKIlB0G2Nv19Tx5ZUZU2fixI.jpg,Dania Film|National Cinematografica,Italy,1992-05-23,,86,Italiano,Released,,The Smile of the Fox,5.6,38


In [39]:
movies_complete.dropna(thresh=3, inplace=True)

In [40]:
movies_complete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   belongs_to_collection  108 non-null    object        
 1   budget                 158 non-null    float64       
 2   genres                 180 non-null    object        
 3   id                     180 non-null    int64         
 4   origin_country         180 non-null    object        
 5   original_language      180 non-null    object        
 6   overview               180 non-null    object        
 7   popularity             180 non-null    float64       
 8   poster_path            180 non-null    object        
 9   production_companies   180 non-null    object        
 10  production_countries   180 non-null    object        
 11  release_date           180 non-null    datetime64[ns]
 12  revenue                161 non-null    float64       
 13  runti

## Final (Cleaning) Steps

__We Keep only those rows/movies in the df with status "Released".__

In [41]:
movies_complete = movies_complete[movies_complete['status']=='Released']

In [42]:
columns = ["id", "title", "tagline", "release_date", "genres", "belongs_to_collection", 
"original_language", 'origin_country', "budget", "revenue", "production_companies",
"production_countries", "vote_count", "vote_average", "popularity", "runtime",
"overview", "spoken_languages", "poster_path"]

In [43]:
movies_complete = movies_complete[columns]
movies_complete

Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,origin_country,budget,revenue,production_companies,production_countries,vote_count,vote_average,popularity,runtime,overview,spoken_languages,poster_path
0,259872,Skin. Like. Sun.,,2010-02-19,Romance,,nl,NL,,,Blue Artichoke Films,Netherlands,31,6.500,394.968,60,Real-life young couple Wim and Floor spend an ...,No Language,/uCkANtG6ezb7hjRKVudY3PUcbvn.jpg
1,411,"The Chronicles of Narnia: The Lion, the Witch ...",The beloved masterpiece comes to life.,2005-12-07,Adventure|Family|Fantasy,The Chronicles of Narnia Collection,en,GB|US,180.0,745.013115,Walt Disney Pictures|Walden Media|C.S. Lewis C...,United Kingdom|United States of America,10727,7.134,359.436,143,"Siblings Lucy, Edmund, Susan and Peter step th...",English|Deutsch,/iREd0rNCjYdf5Ar0vfaW32yrkm.jpg
2,157336,Interstellar,Mankind was born on Earth. It was never meant ...,2014-11-05,Adventure|Drama|Science Fiction,,en,US,165.0,701.729206,Legendary Pictures|Syncopy|Lynda Obst Productions,United Kingdom|United States of America,36242,8.448,338.901,169,The adventures of a group of explorers who mak...,English,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg
3,7451,xXx,A new breed of secret agent.,2002-08-09,Action|Adventure|Thriller|Crime,xXx Collection,en,US,70.0,277.448382,Columbia Pictures|Original Film|Revolution Stu...,United States of America|Czech Republic,4323,5.937,304.922,124,Xander Cage is your standard adrenaline junkie...,Český|English|Deutsch|Español|Pусский,/xeEw3eLeSFmJgXZzmF2Efww0q3s.jpg
4,11845,Intimacy,Every Wednesday. She meets him once per week.,2001-01-20,Romance|Drama|Fantasy,,en,FR,,2.672527,ARTE France Cinéma|Téléma Productions|StudioCa...,France|Germany|United Kingdom,509,6.300,244.899,119,Failed musician Jay abandoned his family and n...,English,/I0mlv1FahaLJqu2ByffMXEn4NN.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,9502,Kung Fu Panda,Prepare for awesomeness.,2008-06-04,Action|Adventure|Animation|Family|Comedy,Kung Fu Panda Collection,en,US,130.0,632.091832,DreamWorks Animation,United States of America,11704,7.295,86.445,90,"When the Valley of Peace is threatened, lazy P...",English,/wWt4JYXTg5Wr3xBW2phBrMKgp3x.jpg
176,24,Kill Bill: Vol. 1,A roaring rampage of revenge.,2003-10-10,Action|Crime,Kill Bill Collection,en,US,30.0,180.906076,Miramax|A Band Apart|Super Cool ManChu,United States of America,17539,8.000,86.341,111,"An assassin is shot by her ruthless employer, ...",English|日本語|Français,/v7TaX8kXMXs5yFFGR41guUDNcnB.jpg
177,45243,The Hangover Part II,The Wolfpack Is Back,2011-05-25,Comedy,The Hangover Collection,en,US,80.0,586.764305,Legendary Pictures|Green Hat Films|Warner Bros...,United States of America,10529,6.500,86.322,102,The Hangover crew heads to Thailand for Stu's ...,English|ภาษาไทย,/7sGkjqorTHkaHTz8Q4WWHj8JL9t.jpg
178,640,Catch Me If You Can,The true story of a real fake.,2002-12-16,Drama|Crime,,en,US,52.0,352.114312,Parkes/MacDonald Productions|Kemp Company|Sple...,United States of America,15785,7.979,86.197,141,"A true story about Frank Abagnale Jr. who, bef...",English|Français,/sdYgEkKCDPWNU6KnoL4qd8xZ4w7.jpg


In [44]:
movies_complete.reset_index(drop=True, inplace=True)

In [45]:
movies_complete.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   id                     180 non-null    int64         
 1   title                  180 non-null    object        
 2   tagline                162 non-null    object        
 3   release_date           180 non-null    datetime64[ns]
 4   genres                 180 non-null    object        
 5   belongs_to_collection  108 non-null    object        
 6   original_language      180 non-null    object        
 7   origin_country         180 non-null    object        
 8   budget                 158 non-null    float64       
 9   revenue                161 non-null    float64       
 10  production_companies   180 non-null    object        
 11  production_countries   180 non-null    object        
 12  vote_count             180 non-null    int64         
 13  vote_

In [46]:
basic_url = 'http://image.tmdb.org/t/p/w185/'
movies_complete['poster_path'] = "<img src='" + basic_url + movies_complete['poster_path'] + "' style='height:100px;'>"
movies_complete.poster_path.to_frame()

Unnamed: 0,poster_path
0,<img src='http://image.tmdb.org/t/p/w185//uCkA...
1,<img src='http://image.tmdb.org/t/p/w185//iREd...
2,<img src='http://image.tmdb.org/t/p/w185//gEU2...
3,<img src='http://image.tmdb.org/t/p/w185//xeEw...
4,<img src='http://image.tmdb.org/t/p/w185//I0ml...
...,...
175,<img src='http://image.tmdb.org/t/p/w185//wWt4...
176,<img src='http://image.tmdb.org/t/p/w185//v7Ta...
177,<img src='http://image.tmdb.org/t/p/w185//7sGk...
178,<img src='http://image.tmdb.org/t/p/w185//sdYg...


In [47]:
movies_complete.to_csv('movies_clean.csv', index=False)

In [48]:
pd.read_csv('movies_clean.csv').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     180 non-null    int64  
 1   title                  180 non-null    object 
 2   tagline                162 non-null    object 
 3   release_date           180 non-null    object 
 4   genres                 180 non-null    object 
 5   belongs_to_collection  108 non-null    object 
 6   original_language      180 non-null    object 
 7   origin_country         180 non-null    object 
 8   budget                 158 non-null    float64
 9   revenue                161 non-null    float64
 10  production_companies   179 non-null    object 
 11  production_countries   180 non-null    object 
 12  vote_count             180 non-null    int64  
 13  vote_average           180 non-null    float64
 14  popularity             180 non-null    float64
 15  runtim

In [52]:
from IPython.display import HTML

HTML(pd.read_csv('movies_clean.csv')[['title', 'poster_path']].head(1).to_html(escape=False))

Unnamed: 0,title,poster_path
0,Skin. Like. Sun.,
