In [3]:
pip install ftfy

Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 KB[0m [31m313.7 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.1.1
You should consider upgrading via the '/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import numpy as np
import ftfy
from scipy import stats

In [5]:
def load_data(filename):
    """Read a csv file from disk and hand it back as a Pandas dataframe.

        filename - string, path of the csv file (decoded as utf-8)

        return Pandas dataframe holding the parsed file
    """
    frame = pd.read_csv(filename, encoding='utf-8')
    return frame

In [6]:
def remove_unnecessary_columns(imdb, columns=('budget',)):
    """Drop unneeded columns from the dataframe in place.

        imdb - Pandas dataframe, modified in place
        columns - column name or iterable of column names to drop
                  (defaults to just 'budget')

        return the same dataframe object, so the call can be chained or
        assigned (an in-place drop on its own evaluates to None)

        raises KeyError if a requested column is not in the dataframe
    """
    # A bare string would otherwise be split into single characters by list().
    if isinstance(columns, str):
        columns = [columns]
    imdb.drop(columns=list(columns), inplace=True)
    return imdb

In [7]:
def null_val(imdb):
    """Count the missing entries in each column of the dataframe.

        imdb - Pandas dataframe

        return Pandas series indexed by column name holding the number of
        missing values found in that column
    """
    missing_mask = imdb.isna()
    return missing_mask.sum()

In [8]:
def fill_missing_values(imdb):
    """Replace missing 'gross' values with the mean of the column, in place.

        imdb - Pandas dataframe with a numeric 'gross' column, modified in place

        return the same dataframe object
    """
    mean_value = imdb['gross'].mean()
    # Assign the filled column back to the frame: calling fillna(inplace=True)
    # on the imdb['gross'] selection acts on an intermediate object and is not
    # guaranteed to update the dataframe itself.
    imdb['gross'] = imdb['gross'].fillna(mean_value)
    return imdb

In [9]:
def update_country_names(imdb):
    """Uppercase every country value and rewrite 'UNITED STATES' as 'USA'.

        imdb - Pandas dataframe with a string 'country' column (the legacy
               spelling 'Country' is also accepted), modified in place

        return the same dataframe object

        raises KeyError if neither column spelling is present
    """
    # The dataset's column is lowercase 'country'; keep accepting the
    # capitalised spelling so frames that use it still work.
    column = 'country' if 'country' in imdb.columns else 'Country'
    upper = imdb[column].str.upper()
    # Missing values compare unequal to the string, so they are left as is.
    imdb[column] = upper.mask(upper == 'UNITED STATES', 'USA')
    return imdb
    

In [10]:
def fix_director_values(imdb):
    """Replace missing director names with an empty string, in place.

        Both real missing values (NaN/None) and the literal placeholder
        strings 'N/A', 'Nan' and 'Null' (any casing, surrounding whitespace
        ignored) are turned into ''.

        imdb - Pandas dataframe with a 'director_name' column, modified in place

        return the same dataframe object
    """
    # fillna(..., inplace=True) evaluates to None; assigning that result back
    # would overwrite the whole column, so use the returned series instead.
    cleaned = imdb['director_name'].fillna('')
    normalised = cleaned.astype(str).str.strip().str.lower()
    is_placeholder = normalised.isin(['n/a', 'nan', 'null'])
    imdb['director_name'] = cleaned.where(~is_placeholder, '')
    return imdb

In [11]:
def fix_unicode_movie_title(imdb):
    """Repair badly decoded text in the 'movie_title' column, in place.

        Each title is passed through ftfy.fix_text, which is intended to undo
        mojibake such as 'Les Mis√©rables'. Non-string entries (for example
        missing values) are left untouched.

        imdb - Pandas dataframe with a 'movie_title' column, modified in place

        return the same dataframe object
    """
    # fix_text works on one string at a time, so apply it per value rather
    # than handing it the whole column.
    imdb['movie_title'] = imdb['movie_title'].map(
        lambda title: ftfy.fix_text(title) if isinstance(title, str) else title
    )
    return imdb
   

In [12]:
def fix_outliers(imdb, threshold=3, min_year=2010):
    """Drop outlier rows from the dataframe in place.

        A row is removed when either
          * the absolute z-score of its 'imdb_score' is above `threshold`, or
          * its 'title_year' is earlier than `min_year`.
        Rows whose score is missing are never flagged by the z-score rule.

        imdb - Pandas dataframe with 'imdb_score' and 'title_year' columns,
               modified in place
        threshold - z-score magnitude above which a score is an outlier
        min_year - earliest title year to keep

        return the same dataframe object
    """
    # zscore is not meaningful on an empty column; nothing to drop anyway.
    if imdb.empty:
        return imdb

    scores = imdb['imdb_score'].to_numpy(dtype=float)
    # nan_policy='omit' keeps a single missing score from turning every
    # z-score into NaN.
    z_scores = np.abs(stats.zscore(scores, nan_policy='omit'))
    score_outlier = z_scores > threshold  # NaN compares False, so it is kept
    too_old = (imdb['title_year'] < min_year).to_numpy()

    # Rows are dropped by index label (labels are unique with the default
    # index produced by read_csv).
    imdb.drop(index=imdb.index[score_outlier | too_old], inplace=True)
    return imdb

In [18]:
def main():
    """Clean up the imdb dataset.

        Reads 'imdb.csv', runs every cleaning step on the dataframe and writes
        the result to 'clean_imdb.csv'. The helper functions are called for
        their in-place effect on the dataframe; their return values are not
        used here.

        return None
    """
    # 3. Rename filename.csv to the data filename.
    imdb = load_data('imdb.csv')

    # Attributes and types are given below for action 2.
    #   color                    object
    #   director_name            object
    #   duration                  int64
    #   gross                   float64
    #   genres                   object
    #   movie_title              object
    #   title_year                int64
    #   language                 object
    #   country                  object
    #   budget                  float64
    #   imdb_score              float64
    #   actors                   object
    #   movie_facebook_likes      int64
    #   dtype: object
    print(imdb.dtypes)

    # 4. Using Pandas dataframe drop function get rid of unnecessary columns (set
    # inplace=True)
    # **5. How many columns did you remove?** We removed ONE column - budget -
    # because it was highly correlated with the gross attribute.
    remove_unnecessary_columns(imdb)

    # 6. How many missing values are there within each column?**
    # hint: use isnull and the sum function
    print(null_val(imdb))

    # Fill the missing 'gross' values with the mean of the column.
    fill_missing_values(imdb)

    # 9. Uppercase all of the country values (hint: str.upper())
    # 9a. replace any reference to United States to USA
    update_country_names(imdb)

    # 10. Replace N/A, Nan, Null with an empty string
    fix_director_values(imdb)

    # 11. Fix unicode in 'movie_title' column with import ftfy
    fix_unicode_movie_title(imdb)

    # 12. Assume a movie cannot be < 10 mins or > 300 mins. If a movie is outside those
    # bounds set the value to 0.
    # mask() replaces the values where the condition is True, i.e. only the
    # out-of-range durations are zeroed.
    duration = imdb['duration']
    imdb['duration'] = duration.mask((duration < 10) | (duration > 300), 0)

    # **13. What would be considered an outlier for imdb_score?**
    # We calculated the z-score. Any value whose absolute z-score is greater
    # than 3 is considered an outlier, because about 99.7% of normally
    # distributed data lies within 3 standard deviations of the mean.

    # 14. Fix imdb_score and title_year (no year prior to 2010) outliers.
    fix_outliers(imdb)

    # 15. output the cleaned up file onto a new csv called clean_imdb.csv
    imdb.to_csv('clean_imdb.csv', index=False)
    