# Loading the datasets

In [1]:
# Importing the required libraries
import pandas as pd
import re
import os
from ETL_Functions.etl_functions import load_csv, concat_df, etl

In [2]:
# Get the list of all files in a common directory.
# `path` is relative to the notebook's working directory; os.listdir returns
# bare entry names (not full paths) in arbitrary order, non-CSV files included.
path = '../PI01-Data-Engineering/Datasets/'
dir_list = os.listdir(path)

In [3]:
# Show the directory contents, one entry per line
print(f'Files stored in {path}:')
for files in dir_list:  # print all files
    print(files)

Files stored in ../PI01-Data-Engineering/Datasets/:
README.md
disney_plus_titles-score.csv
amazon_prime_titles-score.csv
netflix_titles-score.csv
hulu_titles-score (2).csv


In [4]:
# Load the csv files.
# The extension is matched literally at the end of the name: a bare r'.csv'
# regex treats '.' as a wildcard and is not anchored, so it would also accept
# names that merely contain e.g. 'xcsv' somewhere in the middle.
for file in dir_list:
    if file.endswith('.csv'):
        load_csv(path, file)

Succesfull load of disney_plus_titles-score.csv
Succesfull load of amazon_prime_titles-score.csv
Succesfull load of netflix_titles-score.csv
Succesfull load of hulu_titles-score (2).csv


# Transform phase

## 1.- Generating `id`

In [5]:
# Transform phase
# Directory whose files are handed to the ETL step; list its entries by name.
path_ETL = '../PI01-Data-Engineering/Datasets_for_ETL/'
dir_list_ETL = os.listdir(path_ETL)

In [6]:
# Show which files are available in the ETL input directory
print(f'Files stored in {path_ETL}:')
for files_etl in dir_list_ETL:  # print all files
    print(files_etl)
# NOTE(review): `files_etl` stays bound to the last listed name after this loop.

Files stored in ../PI01-Data-Engineering/Datasets_for_ETL/:
df_amazon_prime_titles-score.csv
df_hulu_titles-score (2).csv
df_netflix_titles-score.csv
df_disney_plus_titles-score.csv


In [7]:
# Run the ETL once per CSV in the ETL input directory instead of relying on
# `files_etl`, a loop variable left over from the listing cell that only holds
# the last file name that was printed.
# NOTE(review): assumes etl(path, filename) processes a single file — confirm
# against ETL_Functions.etl_functions.
for etl_file in dir_list_ETL:
    if etl_file.endswith('.csv'):
        etl(path_ETL, etl_file)

In [8]:
# Load the amazon CSV from PI_01 for inspection
# (presumably the output of the ETL step — confirm).
amazon = pd.read_csv('../PI01-Data-Engineering/PI_01/amazon.csv')

In [9]:
# Preview the first three rows
amazon.head(3)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,score,id,date,duration_int,duration_type
0,s1,movie,the grand seduction,don mckellar,"brendan gleeson, taylor kitsch, gordon pinsent",canada,"march 30, 2021",2014,g,113 min,"comedy, drama",a small fishing village must procure a local d...,99,as1,2021-03-30 00:00:00,113,min
1,s2,movie,take care good night,girish joshi,"mahesh manjrekar, abhay mahajan, sachin khedekar",india,"march 30, 2021",2018,13+,110 min,"drama, international",a metro family decides to fight a cyber crimin...,37,as2,2021-03-30 00:00:00,110,min
2,s3,movie,secrets of deception,josh webber,"tom sizemore, lorenzo lamas, robert lasardo, r...",united states,"march 30, 2021",2017,g,74 min,"action, drama, suspense",after a man discovers his wife is cheating on ...,20,as3,2021-03-30 00:00:00,74,min


In [10]:
# Load the netflix CSV from PI_01; the query prototypes below run against it
netflix_df = pd.read_csv('../PI01-Data-Engineering/PI_01/netflix.csv')

In [11]:
# 1.- get_word_count('netflix', 'love')
# Prototype: total number of matches of 'love' summed over all titles.
# Series.str.count interprets its argument as a regex and counts every
# occurrence, so matches inside longer words (e.g. 'lovely') are included.
netflix = netflix_df['title'].str.count('love').sum()

In [12]:
# Inspect the type and value of the count (a numpy integer, not a plain int)
print(type(netflix))
print(netflix)

<class 'numpy.int64'>
198


In [13]:
# Response payload prototype; the count is stringified before being stored
res = {'platform': 'netflix', 'cantidad': str(netflix)}

In [14]:
# Display the payload
res

{'platform': 'netflix', 'cantidad': '198'}

In [15]:
# Prototype of the platform validation: a known platform maps to '<name>.csv'
platform = 'hulu'
if platform not in ('netflix', 'hulu', 'disney', 'amazon'):
    print(f'{platform} is not in the house')
else:
    print(f'{platform} is on the house')
    print(platform + '.csv')

hulu is on the house
hulu.csv


In [16]:
# 2.- get_score_count('netflix', 85, 2010)
# Number of titles with a score strictly above 85 and release year 2010
above_score = netflix_df['score'] > 85
released_in_year = netflix_df['release_year'] == 2010
print(len(netflix_df[above_score & released_in_year]))

25


In [17]:
# 3.- get_second_score('amazon')
# Load the amazon CSV from PI_01 under its own name for this query
amazon_df = pd.read_csv('../PI01-Data-Engineering/PI_01/amazon.csv')

In [18]:
# Preview the first two rows
amazon_df.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,score,id,date,duration_int,duration_type
0,s1,movie,the grand seduction,don mckellar,"brendan gleeson, taylor kitsch, gordon pinsent",canada,"march 30, 2021",2014,g,113 min,"comedy, drama",a small fishing village must procure a local d...,99,as1,2021-03-30 00:00:00,113,min
1,s2,movie,take care good night,girish joshi,"mahesh manjrekar, abhay mahajan, sachin khedekar",india,"march 30, 2021",2018,13+,110 min,"drama, international",a metro family decides to fight a cyber crimin...,37,as2,2021-03-30 00:00:00,110,min


In [19]:
# Rank by score, highest first, and show the top two rows
amazon.sort_values(by=['score'], ascending=False).head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,score,id,date,duration_int,duration_type
1133,s1134,tv show,"ronja, the robber's daughter",0,"gillian anderson, theresa gallagher",0,0,2017,7+,1 season,"anime, kids",presented by studio ghibli. the daughter of a ...,100,as1134,0,1,season
9527,s9528,movie,the lazarus effect,david gelb,"mark duplass, olivia wilde, sarah bolger, evan...",0,0,2015,pg-13,83 min,"horror, science fiction, suspense",a group of medical researchers discover a way ...,100,as9528,0,83,min


In [20]:
# Alphabetical order by title, first two rows
amazon.sort_values(by=['title'], ascending=True).head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,score,id,date,duration_int,duration_type
5547,s5548,movie,"""mixed up""",nishi chawla,"uday krishna, bethany rishell, lucy bond, sanj...",0,0,2020,16+,106 min,"drama, romance","""mixed up"" examines casual factors that make u...",30,as5548,0,106,min
5977,s5978,tv show,"""the paramedic angel""",0,"nate reidnauer, nikki hrichak, nina randazzo, ...",0,0,2021,all,1 season,drama,the tragedy of a loving family man and paramed...,100,as5978,0,1,season


In [21]:
# Rank by score (descending) and break ties alphabetically by title
amazon.sort_values(by=['score', 'title'], ascending=[False, True]).head(6)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,score,id,date,duration_int,duration_type
5977,s5978,tv show,"""the paramedic angel""",0,"nate reidnauer, nikki hrichak, nina randazzo, ...",0,0,2021,all,1 season,drama,the tragedy of a loving family man and paramed...,100,as5978,0,1,season
3546,s3547,movie,15-minute cardio core 8.0 workout (with weights),0,maggie binkley,0,0,2019,all,20 min,fitness,circuit through cardio & core exercises in thi...,100,as3547,0,20,min
3501,s3502,tv show,2019 avp austin open - day 3,0,0,0,0,2019,tv-nr,1 season,tv shows,"fittingly, the avp’s second stop on the 2019 t...",100,as3502,0,1,season
3499,s3500,tv show,2019 avp hermosa beach open - day 2,0,0,0,0,2019,tv-nr,1 season,tv shows,"july 26th – 28th, 2019; hermosa beach is a cor...",100,as3500,0,1,season
6468,s6469,tv show,227,0,"marla gibbs, hal williams, alaina reed, jackée...",0,0,1990,tv-pg,5 seasons,comedy,"in a world gone ""condo,"" apartment building 22...",100,as6469,0,5,season
3366,s3367,movie,abilene town,edwin l. marin,"ann dvorak, randolph scott, edgar buchanan, rh...",0,0,1946,13+,89 min,western,"abilene, kansas, town marshal dan mitchell (sc...",100,as3367,0,89,min


In [22]:
# Title of the second row of that ranking (iloc is zero-based, so index 1)
amazon.sort_values(by=['score', 'title'], ascending=[False, True]).iloc[1]['title']

'15-minute cardio core 8.0 workout (with weights)'

In [23]:
# Score of the second row of the same ranking
amazon.sort_values(by=['score', 'title'], ascending=[False, True]).iloc[1]['score']

100

In [24]:
# Add a string-typed copy of the title column (this mutates `amazon` in place)
amazon['title_2'] = amazon['title'].astype(str)

In [25]:
# List the column labels of `amazon`
amazon.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'score', 'id', 'date', 'duration_int', 'duration_type', 'title_2'],
      dtype='object')

In [26]:
# Same ranking on `amazon_df`: score descending, then title ascending
amazon_df.sort_values(by=['score', 'title'], ascending=[False,True]).head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,score,id,date,duration_int,duration_type
5977,s5978,tv show,"""the paramedic angel""",0,"nate reidnauer, nikki hrichak, nina randazzo, ...",0,0,2021,all,1 season,drama,the tragedy of a loving family man and paramed...,100,as5978,0,1,season
3546,s3547,movie,15-minute cardio core 8.0 workout (with weights),0,maggie binkley,0,0,2019,all,20 min,fitness,circuit through cardio & core exercises in thi...,100,as3547,0,20,min


In [28]:
# Alphabetical order by title on `amazon_df`, first two rows
amazon_df.sort_values(by=['title'], ascending=True).head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,score,id,date,duration_int,duration_type
5547,s5548,movie,"""mixed up""",nishi chawla,"uday krishna, bethany rishell, lucy bond, sanj...",0,0,2020,16+,106 min,"drama, romance","""mixed up"" examines casual factors that make u...",30,as5548,0,106,min
5977,s5978,tv show,"""the paramedic angel""",0,"nate reidnauer, nikki hrichak, nina randazzo, ...",0,0,2021,all,1 season,drama,the tragedy of a loving family man and paramed...,100,as5978,0,1,season


In [29]:
# 4.- get_longest('netflix', 'min', '2016')
# Every 2016 title whose duration is measured in minutes, longest first
(
    netflix_df
    .loc[lambda df: (df['duration_type'] == 'min') & (df['release_year'] == 2016)]
    .sort_values(by='duration_int', ascending=False)
)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,score,id,date,duration_int,duration_type
7922,s7923,movie,sairat,nagraj manjule,"rinku rajguru, akash thosar, arbaz shaikh, tan...",india,"april 1, 2018",2016,tv-14,173 min,"dramas, international movies, romantic movies",when two college students – a rich man’s daugh...,41,ns7923,2018-04-01 00:00:00,173,min
4730,s4731,movie,mohenjo daro,ashutosh gowariker,"hrithik roshan, sonakshi sinha, pooja hegde, k...",india,"august 2, 2018",2016,tv-14,168 min,"action & adventure, dramas, international movies",a courageous villager moves to the ancient cit...,53,ns4731,2018-08-02 00:00:00,168,min
7565,s7566,movie,natsamrat - asa nat hone nahi,mahesh manjrekar,"nana patekar, medha manjrekar, mrinmayee deshp...",india,"june 1, 2018",2016,tv-14,165 min,"dramas, international movies",a veteran shakespearean actor steps off the st...,60,ns7566,2018-06-01 00:00:00,165,min
6144,s6145,movie,american honey,andrea arnold,"sasha lane, shia labeouf, riley keough, mccaul...","united kingdom, united states","april 27, 2019",2016,r,163 min,"dramas, independent movies",a teenage girl leaves her dull life in oklahom...,87,ns6145,2019-04-27 00:00:00,163,min
5420,s5421,movie,dangal,nitesh tiwari,"aamir khan, sakshi tanwar, fatima sana shaikh,...",india,"june 21, 2017",2016,tv-pg,161 min,"dramas, international movies, sports movies",a once-promising wrestler pursues the gold med...,93,ns5421,2017-06-21 00:00:00,161,min
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4897,s4898,movie,pocoyo special sports,"guillermo garcia, david cantolla",0,0,"may 1, 2018",2016,tv-y,24 min,movies,everyone is getting excited for the games. poc...,4,ns4898,2018-05-01 00:00:00,24,min
7848,s7849,movie,refugee,"clementine malpas, leslie knott","cate blanchett, lynsey addario, omar victor di...",0,"march 10, 2017",2016,tv-pg,24 min,documentaries,five acclaimed photographers travel the world ...,66,ns7849,2017-03-10 00:00:00,24,min
5857,s5858,movie,kung fu panda: secrets of the scroll,rodolphe guenoden,"jack black, dustin hoffman, seth rogen, david ...",united states,"march 25, 2016",2016,tv-pg,23 min,"children & family movies, comedies",when a twist of fate brings five unlikely anim...,98,ns5858,2016-03-25 00:00:00,23,min
7313,s7314,movie,little lunch: the halloween horror story,tim bartley,"flynn curry, olivia deeble, madison lu, oisín ...",australia,"march 15, 2018",2016,tv-y7,23 min,"children & family movies, comedies",when the class puts on a halloween play about ...,49,ns7314,2018-03-15 00:00:00,23,min


In [30]:
# Keep only the top row of that ordering: the longest 2016 title in minutes
(
    netflix_df
    .loc[lambda df: (df['duration_type'] == 'min') & (df['release_year'] == 2016)]
    .sort_values(by='duration_int', ascending=False)
    .head(1)
)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,score,id,date,duration_int,duration_type
7922,s7923,movie,sairat,nagraj manjule,"rinku rajguru, akash thosar, arbaz shaikh, tan...",india,"april 1, 2018",2016,tv-14,173 min,"dramas, international movies, romantic movies",when two college students – a rich man’s daugh...,41,ns7923,2018-04-01 00:00:00,173,min


In [31]:
# Same top row, restricted to the title and the raw/typed duration columns
(
    netflix_df
    .loc[lambda df: (df['duration_type'] == 'min') & (df['release_year'] == 2016)]
    .sort_values(by='duration_int', ascending=False)
    .loc[:, ['title', 'duration', 'duration_type']]
    .head(1)
)

Unnamed: 0,title,duration,duration_type
7922,sairat,173 min,min


In [32]:
# Store the top row with the numeric duration instead of the raw text one
longest = (
    netflix_df
    .loc[lambda df: (df['duration_type'] == 'min') & (df['release_year'] == 2016)]
    .sort_values(by='duration_int', ascending=False)
    .loc[:, ['title', 'duration_int', 'duration_type']]
    .head(1)
)


In [33]:
# Plain-text view of the single-row result
print(longest)

       title  duration_int duration_type
7922  sairat           173           min


In [34]:
# DataFrame.to_dict() nests the values by column and then by index label
print(longest.to_dict())

{'title': {7922: 'sairat'}, 'duration_int': {7922: 173}, 'duration_type': {7922: 'min'}}


In [35]:
# 5.- get_rating_count('18+')

# Concat all df dataframes: list the CSVs in PI_01 that will be combined.
# df_full.csv is read from this same directory further down, so it is excluded
# here; otherwise re-running the notebook would feed the previous combined
# file back into concat_df.
# NOTE(review): assumes concat_df is what writes df_full.csv here — confirm.
path_df_clean = '../PI01-Data-Engineering/PI_01/'
dir_list_clean = [
    name
    for name in os.listdir(path_df_clean)
    if name.endswith('.csv') and name != 'df_full.csv'
]

In [36]:
# Combine the listed files with the project helper.
# NOTE(review): presumably writes the df_full.csv read in the next cell — confirm.
concat_df(path_df_clean, dir_list_clean)

Succesful load of amazon.csv
Succesful load of hulu.csv
Succesful load of disney.csv
Succesful load of netflix.csv


In [37]:
# Load the combined dataset
df_total = pd.read_csv('../PI01-Data-Engineering/PI_01/df_full.csv')

# Count the rows whose rating is exactly '18+' (each True adds 1 to the sum)
(df_total['rating'] == '18+').sum()

1243

In [38]:
# Column dtypes and non-null counts of the combined dataset
df_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22998 entries, 0 to 22997
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   show_id        22998 non-null  object
 1   type           22998 non-null  object
 2   title          22998 non-null  object
 3   director       22998 non-null  object
 4   cast           22998 non-null  object
 5   country        22998 non-null  object
 6   date_added     22998 non-null  object
 7   release_year   22998 non-null  int64 
 8   rating         22998 non-null  object
 9   duration       22998 non-null  object
 10  listed_in      22998 non-null  object
 11  description    22998 non-null  object
 12  score          22998 non-null  int64 
 13  id             22998 non-null  object
 14  date           22998 non-null  object
 15  duration_int   22998 non-null  int64 
 16  duration_type  22998 non-null  object
dtypes: int64(3), object(14)
memory usage: 3.0+ MB
