In [49]:
import pandas as pd
import numpy as np
from pprint import pprint

In [50]:
# Load the raw movie data, discard rows containing any missing value, and
# renumber the surviving rows from 0. reset_index(drop=True) throws the old
# index away directly instead of materialising it as an 'index' column that
# would then have to be dropped again.
df = (
    pd.read_csv('data.csv')
    .dropna()
    .reset_index(drop=True)
    # 'Unnamed: 0' is presumably a row-number column written out when the CSV
    # was saved -- TODO confirm against the raw file.
    .drop(columns=['Unnamed: 0'])
)
df

Unnamed: 0,IMDB ID,Title,Director,Actor,Year,Box Office,Rating,IMDB Rating,IMDB Votes,Metascore
0,tt1375666,Inception,Christopher Nolan,Leonardo DiCaprio,2010,"$292,568,851",PG-13,8.8,1953512,74.0
1,tt1345836,The Dark Knight Rises,Christopher Nolan,Christian Bale,2012,"$448,130,642",PG-13,8.4,1454342,78.0
2,tt0816692,Interstellar,Christopher Nolan,Ellen Burstyn,2014,"$158,737,441",PG-13,8.6,1395990,74.0
3,tt1853728,Django Unchained,Quentin Tarantino,Jamie Foxx,2012,"$162,804,648",R,8.4,1282957,81.0
4,tt0848228,The Avengers,Joss Whedon,Robert Downey Jr.,2012,"$623,279,547",PG-13,8.0,1225316,69.0
...,...,...,...,...,...,...,...,...,...,...
640,tt2294629,Frozen,Chris Buck,Kristen Bell,2013,"$400,736,600",PG,7.5,551637,75.0
641,tt1591479,Act of Valor,Mike McCoy,Rorke Denver,2012,"$65,800,000",R,6.5,64922,40.0
642,tt1473832,Bridget Jones's Baby,Sharon Maguire,Renée Zellweger,2016,"$24,089,465",R,6.5,64988,59.0
643,tt1763303,The First Time,Jonathan Kasdan,Britt Robertson,2012,"$17,061",PG-13,6.8,64441,55.0


In [51]:
# Inspect the column dtypes to see which columns still need converting.
df.dtypes

IMDB ID         object
Title           object
Director        object
Actor           object
Year            object
Box Office      object
Rating          object
IMDB Rating    float64
IMDB Votes      object
Metascore      float64
dtype: object

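## Things to Clean Up
* Year: remove hyphenated years and convert the column to integers
* Box Office: remove dollar signs and commas, then convert to a numeric type
* IMDB Votes: remove commas, then convert to a numeric type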

## Year Cleanup

### Update the data type of the Year column

In [52]:
# Cast only the Year column to an integer dtype; every other column keeps
# its current dtype. astype returns a new frame, which is rebound to df.
df = df.astype(dtype={'Year': int})
# Show the dtypes again so the change to Year is visible.
df.dtypes

IMDB ID         object
Title           object
Director        object
Actor           object
Year             int32
Box Office      object
Rating          object
IMDB Rating    float64
IMDB Votes      object
Metascore      float64
dtype: object

### Find rows outside the 2010s range (2010–2019)

In [53]:
# Collect the index labels of every row whose Year falls outside 2010-2019.
# A vectorized boolean mask replaces the row-by-row iterrows() loop.
# between() is inclusive on both ends, so its negation matches the original
# condition `year < 2010 or year > 2019`.
outside_2010s = ~df['Year'].between(2010, 2019)
# tolist() yields plain Python ints, so the printed list looks the same as
# one built by appending labels in a loop.
index_list = df.index[outside_2010s].tolist()
print(index_list)

[176, 204, 262, 344, 610, 612, 637]


In [54]:
# Remove the rows flagged in index_list (labels are matched against the row
# index) and rebind df to the filtered frame.
df = df.drop(index=index_list)
# Preview the first rows of the filtered frame.
df.head()

Unnamed: 0,IMDB ID,Title,Director,Actor,Year,Box Office,Rating,IMDB Rating,IMDB Votes,Metascore
0,tt1375666,Inception,Christopher Nolan,Leonardo DiCaprio,2010,"$292,568,851",PG-13,8.8,1953512,74.0
1,tt1345836,The Dark Knight Rises,Christopher Nolan,Christian Bale,2012,"$448,130,642",PG-13,8.4,1454342,78.0
2,tt0816692,Interstellar,Christopher Nolan,Ellen Burstyn,2014,"$158,737,441",PG-13,8.6,1395990,74.0
3,tt1853728,Django Unchained,Quentin Tarantino,Jamie Foxx,2012,"$162,804,648",R,8.4,1282957,81.0
4,tt0848228,The Avengers,Joss Whedon,Robert Downey Jr.,2012,"$623,279,547",PG-13,8.0,1225316,69.0


## Box Office Cleanup

In [55]:
# Strip the '$' and ',' formatting characters from every Box Office string
# and convert the column to float. The vectorized .str accessor replaces the
# per-element apply/lambda. Inside a character class '$' is a literal, so the
# pattern removes exactly those two characters.
df['Box Office'] = (
    df['Box Office']
    .str.replace(r'[$,]', '', regex=True)
    .astype(float)
)
df.head()

Unnamed: 0,IMDB ID,Title,Director,Actor,Year,Box Office,Rating,IMDB Rating,IMDB Votes,Metascore
0,tt1375666,Inception,Christopher Nolan,Leonardo DiCaprio,2010,292568851.0,PG-13,8.8,1953512,74.0
1,tt1345836,The Dark Knight Rises,Christopher Nolan,Christian Bale,2012,448130642.0,PG-13,8.4,1454342,78.0
2,tt0816692,Interstellar,Christopher Nolan,Ellen Burstyn,2014,158737441.0,PG-13,8.6,1395990,74.0
3,tt1853728,Django Unchained,Quentin Tarantino,Jamie Foxx,2012,162804648.0,R,8.4,1282957,81.0
4,tt0848228,The Avengers,Joss Whedon,Robert Downey Jr.,2012,623279547.0,PG-13,8.0,1225316,69.0


## IMDB Votes Cleanup

In [56]:
# Strip the thousands separators from every IMDB Votes string and convert the
# column to float. The vectorized .str accessor replaces the per-element
# apply/lambda; regex=False treats ',' as a plain substring.
df['IMDB Votes'] = (
    df['IMDB Votes']
    .str.replace(',', '', regex=False)
    .astype(float)
)
df.head()

Unnamed: 0,IMDB ID,Title,Director,Actor,Year,Box Office,Rating,IMDB Rating,IMDB Votes,Metascore
0,tt1375666,Inception,Christopher Nolan,Leonardo DiCaprio,2010,292568851.0,PG-13,8.8,1953512.0,74.0
1,tt1345836,The Dark Knight Rises,Christopher Nolan,Christian Bale,2012,448130642.0,PG-13,8.4,1454342.0,78.0
2,tt0816692,Interstellar,Christopher Nolan,Ellen Burstyn,2014,158737441.0,PG-13,8.6,1395990.0,74.0
3,tt1853728,Django Unchained,Quentin Tarantino,Jamie Foxx,2012,162804648.0,R,8.4,1282957.0,81.0
4,tt0848228,The Avengers,Joss Whedon,Robert Downey Jr.,2012,623279547.0,PG-13,8.0,1225316.0,69.0


## Verify data types

In [57]:
# Final check: Year, Box Office and IMDB Votes should now report numeric
# dtypes rather than object.
df.dtypes

IMDB ID         object
Title           object
Director        object
Actor           object
Year             int32
Box Office     float64
Rating          object
IMDB Rating    float64
IMDB Votes     float64
Metascore      float64
dtype: object