In [1]:
import pandas as pd

# Read the four per-platform cleaned catalogs; the tuple order on the left
# matches the order of the paths on the right.
df_amazon, df_netflix, df_hulu, df_disney = (
    pd.read_csv(csv_path)
    for csv_path in (
        'cleaned_data/amazon_data_clean.csv',
        'cleaned_data/netflix_data_clean.csv',
        'cleaned_data/hulucleandata.csv',
        'cleaned_data/disneypluscleandatatitles.csv',
    )
)

In [2]:
# Stack the four platform catalogs in a single concat (no repeated copying of
# the growing frame), then drop the leftover 'Unnamed: 0' column and every row
# that has no rating. Nothing is mutated in place, so re-running this cell
# always rebuilds df from the loaded frames.
df = (
    pd.concat([df_amazon, df_netflix, df_hulu, df_disney], axis=0, ignore_index=True)
    .drop(columns='Unnamed: 0')
    # subset is passed as a list: a bare string label is not accepted by older pandas releases
    .dropna(subset=['rating'])
)
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,release_year,rating,duration,listed_in,description,date_added
0,s2,Movie,Take Care Good Night,Girish Joshi,"Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar",India,2018,13+,110.0,"Drama, International",A Metro Family decides to fight a Cyber Crimin...,
1,s87,TV Show,Wipeout Canada,unknown,"Ennis Esmer, Jonathan Torrens, Jessica Phillips",unknown,2011,G,1.0,"Comedy, Documentary, Unscripted",Daring Canadian contestants are in pursuit of ...,
2,s17,Movie,Zoombies,Glenn Miller,"Marcus Anderson, Kaiwi Lyman, Andrew Asper",unknown,2016,13+,87.0,"Horror, Science Fiction",When a strange virus quickly spreads through a...,
3,s18,TV Show,Zoo Babies,unknown,Narrator - Gillian Barlett,unknown,2008,G,1.0,"Kids, Special Interest",A heart warming and inspiring series that welc...,
4,s19,TV Show,Zoë Coombs Marr: Bossy Bottom,unknown,Zoë Coombs Marr,unknown,2020,18+,1.0,"Comedy, Talk Show and Variety",Zoë Coombs Marr has been on hiatus. Sort of. F...,


In [3]:
# Summarise the merged frame: column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22121 entries, 0 to 22126
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   show_id       22121 non-null  object 
 1   type          22121 non-null  object 
 2   title         22121 non-null  object 
 3   director      19568 non-null  object 
 4   cast          19379 non-null  object 
 5   country       21173 non-null  object 
 6   release_year  22121 non-null  int64  
 7   rating        22121 non-null  object 
 8   duration      21748 non-null  float64
 9   listed_in     22121 non-null  object 
 10  description   22120 non-null  object 
 11  date_added    12759 non-null  object 
dtypes: float64(1), int64(1), object(10)
memory usage: 2.2+ MB


In [4]:
# Fill missing durations with the most common duration of the same title type.
# The preview of the merged frame shows Movie rows with values like 110 and
# TV Show rows with 1, so the column appears to mix units (minutes vs. season
# counts); one global mode would fill one of the two types with the wrong unit.
overall_mode = df['duration'].mode()[0]


def _fill_with_type_mode(durations):
    """Fill NaNs in one type-group's durations with that group's own mode."""
    type_mode = durations.mode()
    # A group with no known duration at all has an empty mode; use the overall mode instead
    return durations.fillna(overall_mode if type_mode.empty else type_mode.iloc[0])


df['duration'] = df.groupby('type')['duration'].transform(_fill_with_type_mode)
# Any row still missing a duration after the per-type fill gets the overall mode
df['duration'] = df['duration'].fillna(overall_mode).astype(int)
# Replace missing director and cast values with the 'unknown' placeholder
df['director'] = df['director'].fillna('unknown')
df['cast'] = df['cast'].fillna('unknown')

In [5]:
# define function to clean rating col
def clean_rating(string):
    """Normalise a single rating label.

    Args:
        string: Raw rating value taken from the rating column.

    Returns:
        'NR' when the value equals 'NOT RATED'; 'unknown' when the value
        contains 'min' or 'Season', or when the comparison/membership checks
        raise TypeError (e.g. NaN or None); otherwise the value unchanged.
    """
    try:
        if string == 'NOT RATED':
            return 'NR'
        # Values containing these tokens are treated as not being a rating
        is_misplaced_value = 'min' in string or 'Season' in string
    except TypeError:
        # Non-string input does not support the checks above
        return 'unknown'
    return 'unknown' if is_misplaced_value else string

In [6]:
# Run every rating through clean_rating, store the result back on the frame,
# then list the distinct labels that remain.
cleaned_ratings = df['rating'].apply(clean_rating)
df['rating'] = cleaned_ratings
df['rating'].unique()

array(['13+', 'G', '18+', 'R', 'TV-Y', 'TV-Y7', 'NR', '16+', 'TV-PG',
       '7+', 'TV-14', 'TV-NR', 'TV-G', 'PG-13', 'TV-MA', 'PG', 'NC-17',
       'UR', 'TV-Y7-FV', 'unknown'], dtype=object)

In [7]:
# define function to get main genre
def get_genre(string):
    """Return the main genre: the first comma-separated entry of a genre list.

    Args:
        string: Genre listing such as 'Drama, International'.

    Returns:
        The first entry with surrounding whitespace removed, or 'unknown'
        when the input is not a string (e.g. NaN or None) or the first entry
        is blank.
    """
    if not isinstance(string, str):
        # Missing values would otherwise fail with AttributeError on .split
        return 'unknown'
    # maxsplit=1: only the text before the first comma is needed
    genre = string.split(',', 1)[0].strip()
    return genre or 'unknown'
    

In [13]:
# Compute one genre per title from listed_in via get_genre, store it as the
# main_genre column, then list the distinct genres produced.
primary_genres = df['listed_in'].apply(get_genre)
df['main_genre'] = primary_genres
df['main_genre'].unique()

array(['Drama', 'Comedy', 'Horror', 'Kids', 'Science Fiction', 'Action',
       'Arts', 'TV Shows', 'Documentary', 'Animation', 'Anime', 'Fitness',
       'Music Videos and Concerts', 'Faith and Spirituality',
       'Special Interest', 'Adventure', 'Fantasy', 'Suspense',
       'Unscripted', 'Western', 'Arthouse', 'Sports', 'Military and War',
       'International', 'Romance', 'Young Adult Audience',
       'Talk Show and Variety', 'LGBTQ', 'Historical', 'Documentaries',
       'International TV Shows', 'Crime TV Shows', 'Docuseries',
       'TV Dramas', 'Children & Family Movies', 'Dramas',
       'British TV Shows', 'Comedies', 'TV Comedies', 'Thrillers',
       'Horror Movies', "Kids' TV", 'Action & Adventure', 'Reality TV',
       'Anime Series', 'International Movies', 'Sci-Fi & Fantasy',
       'Classic Movies', 'Stand-Up Comedy', 'TV Action & Adventure',
       'Movies', 'Stand-Up Comedy & Talk Shows', 'Classic & Cult TV',
       'Anime Features', 'Romantic TV Shows', 'Cult Mo

In [9]:
# Re-check dtypes and non-null counts after the fills and the new main_genre column
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22121 entries, 0 to 22126
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       22121 non-null  object
 1   type          22121 non-null  object
 2   title         22121 non-null  object
 3   director      22121 non-null  object
 4   cast          22121 non-null  object
 5   country       21173 non-null  object
 6   release_year  22121 non-null  int64 
 7   rating        22121 non-null  object
 8   duration      22121 non-null  int32 
 9   listed_in     22121 non-null  object
 10  description   22120 non-null  object
 11  date_added    12759 non-null  object
 12  main_genre    22121 non-null  object
dtypes: int32(1), int64(1), object(11)
memory usage: 2.3+ MB


In [10]:
# Let pandas pick the best-fitting nullable dtypes for each column, then
# print the schema again to confirm the result
df = df.convert_dtypes()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22121 entries, 0 to 22126
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       22121 non-null  string
 1   type          22121 non-null  string
 2   title         22121 non-null  string
 3   director      22121 non-null  string
 4   cast          22121 non-null  string
 5   country       21173 non-null  string
 6   release_year  22121 non-null  Int64 
 7   rating        22121 non-null  string
 8   duration      22121 non-null  Int32 
 9   listed_in     22121 non-null  string
 10  description   22120 non-null  string
 11  date_added    12759 non-null  string
 12  main_genre    22121 non-null  string
dtypes: Int32(1), Int64(1), string(11)
memory usage: 2.3 MB


In [11]:
# List the distinct values present in the type column
df['type'].unique()

<StringArray>
['Movie', 'TV Show']
Length: 2, dtype: string

In [12]:
# Write the merged, cleaned frame to CSV; index=False keeps the row index out of the file
df.to_csv('cleaned_data/df_merged_clean.csv', index=False)