# Data Acquisition

Let's start with imports: Pandas will help us handling DataFrames, and BeautifulSoup will help us with the parsing of the files.

In [None]:
import pandas as pd
import html
import requests
import bs4
from bs4 import BeautifulSoup
import scipy as sc
import numpy as np

The main function which perform the crawling and build list of the appropriate data

In [11]:
def extract_data_html(html):
    soup1 = BeautifulSoup(html,"html.parser")
    #if there is an error page ,we don't add this page to the list of data and return None
    error=soup1.find(class_="ipc-page-content-container ipc-page-content-container--center _error__ErrorPageContentContainer-sc-ql15x0-0 lkWKVH")
    if error is not None:
        data = None
        return data
    else:
        #search by rating 
        infor=soup1.find("span",attrs={"class":"AggregateRatingButton__RatingScore-sc-1ll29m0-1 iTLWoV"})
        #if there is no rating in a specific series ,,we don't add this page to the list of data and return None
        if infor is None:
            data = None
            return data
        else:
            rating = (float)(infor.text)
        #search by name 
        name= soup1.find(class_="TitleHeader__TitleText-sc-1wu6n3d-0 dxSWFG")
        if name is None:
            nameSeries=""
        else:
            nameSeries= name.text
        #search by year of published 
        year=soup1.find(class_="TitleBlockMetaData__ListItemText-sc-12ein40-2 jedhex")
        if year is None:
            yearPublished =""
        else:
            yearPublished=year.text
        #search by length of chapter
        chapters=soup1.findAll(class_="ipc-metadata-list-item__content-container")
        if chapters is None:
            chapter = -1
        i=0
        j=0
        #change the data to integer value and convert it to minutes
        hoursInt=0
        minutesInt=0
        while i<(len(chapters)-1):
            j=0
            chip =chapters[i].text.split()
            while j< len(chip):
                if chip[j] == 'hour':
                    if chip[j-1].isalpha() == False:
                        hours = chip[j-1][0]
                        if hours.isnumeric() == True:
                            hoursInt = (int)(hours)
                        else:
                            hoursInt = 0
                if chip[j] == 'minutes':
                    if chip[j-1].isnumeric() == True:
                        if len(chip[j-1])>2:
                            minutesInt = 0
                        else:
                            minutes = chip[j-1]
                            minutesInt = (int)(minutes)    
                j= j+1
            i=i+1
        total=hoursInt*60 + minutesInt
        chapter=total
        #search by Genre
        genre=soup1.findAll(class_="ipc-chip__text")
        genreName=genre[0].text
        if genre is None:
            genreName=""
        #search by Creator    
        creator=soup1.find(class_="ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link")
        if creator is None:
            creator=""
        else:
            creator=creator.text
        #search by Top cast 
        topCast=soup1.findAll(class_="StyledComponents__ActorName-sc-y9ygcu-1 ezTgkS")
        if topCast is None:
            listTopCast=""
        else:
            i=0
            listTopCast=[]
            while i < len(topCast):
                listTopCast.append(topCast[i].text)
                i= i+1
        #search by Reviews and divide it to user Reviews and critic Reviews
        reviews=soup1.findAll(class_="score")
        userReviews=-1
        criticReviews=-1
        if len(reviews)<1:
            userReviews=-1
            criticReviews=-1
        else:
            if reviews[0] is None:
                userReviews=-1
            else:
                if reviews[0].text.isnumeric() == False:
                    userReviews=reviews[0].text.split('K')   
                    userReviews=(int)((float)(userReviews[0])*1000)
                else:
                    userReviews=(int)(reviews[0].text)
            if len(reviews)==1:
                criticReviews=-1
            else:
                if len(reviews[1])!=0:
                    if reviews[1].text.isnumeric() == False:
                        criticReviews1=reviews[1].text.split('K')
                        if criticReviews1[0].isnumeric() == True:
                            criticReviews=(int)((float)(criticReviews1[0])*1000)
                    else:
                        criticReviews=(int)(reviews[1].text)
                else:
                     criticReviews=-1
        #search by Views               
        view=soup1.find(class_="AggregateRatingButton__TotalRatingAmount-sc-1ll29m0-3 jkCVKJ")
        views=-1
        if view is None:
            views=-1
        else:
            if view.text.isnumeric() == False:
                views1=view.text.split('K')
                views2=view.text.split('M')
                if views1[0] is not None:
                    if views1[0].isnumeric() == True:
                        views=(int)((float)(views1[0])*1000)
                    elif views2[0].isnumeric() == True:
                        views=(int)((float)(views2[0])*1000000)
            else:
                views=(int)(view.text)
        #all these data enter to this list
        data = [nameSeries, creator, genreName, yearPublished, chapter, listTopCast, userReviews, criticReviews, views, rating]
        return data


function that convert the DataFrame into csv file and save it

In [12]:
def save_csv(df,name):
    df.to_csv(name,encoding='iso8859-8',errors='replace')

function that build the DataFrame - by Genre
 - using IMDB Web

In [13]:
def load_csv_by_genre(genreName):
    columns = ['SeriesName', 'Creator', 'Genre', 'YearOfPublished', 'ChapterLengthInMiniutes', 'TopCast', 'UserReviews','CriticReviews', 'Views', 'Rating', 'SeriesId']
    count = 0
    nextPage=1
    #Url of the main page of IMDb 
    url_template2= "https://www.imdb.com/{}"
    #Url of genre page in IMDb 
    url_template3= "https://www.imdb.com/search/title/?genres={}&start={}&explore=title_type,genres&ref_=adv_nxt"
    series_data_frame = pd.DataFrame(columns=columns)
    while count<25:
        link_to_genre_page =[]
        r3=requests.get(url_template3.format(genreName,nextPage))
        soup3 =  BeautifulSoup(r3.content)
        for s in soup3.findAll("div",attrs={"class":"lister-item mode-advanced"}):
            link_to_genre_page.append(s.find('a')['href'])
        for url in link_to_genre_page:
            formath=url.split("tt")
            num=formath[1].split("/")
            seriesId= num[0]
            r2=requests.get(url_template2.format(url))
            data = extract_data_html(r2.content) 
            if data is not None:
                if seriesId is not None:
                    data.append(seriesId)
                else:
                    seriesId =0
                d = {col: data[i] for i, col in enumerate(columns)}
                series_data_frame = series_data_frame.append(d, ignore_index=True)
        count = count + 1 
        nextPage = nextPage + 50
    return series_data_frame


- The list of the genres 

In [None]:
listGenres=['comedy','sci-fi','romance','action','Thriller','drama','mystery','crime','animation','adventure','fantasy','horror']
template_csv= "{}.csv"

call the function by each genre from the list and save the results - each genre has a dataFrame
- dfGenre0 - dfGenre11

In [10]:
dfGenre0 = load_csv_by_genre(listGenres[0])
save_csv(dfGenre0,template_csv.format(listGenres[0]))
dfGenre0


Unnamed: 0,SeriesName,Creator,Genre,YearOfPublished,ChapterLengthInMiniutes,TopCast,UserReviews,CriticReviews,Views,Rating,SeriesId
0,Peacemaker,James Gunn,Action,2022–,40,"[John Cena, Danielle Brooks, Freddie Stroma, C...",335,21,22000,8.4,13146488
1,Encanto,Jared Bush,Animation,2021,102,"[Stephanie Beatriz, María Cecilia Botero, John...",827,192,113000,7.3,2953050
2,After Life,Ricky Gervais,Comedy,2019–2022,30,"[Ricky Gervais, Tom Basden, Tony Way, Diane Mo...",2000,70,112000,8.5,8398600
3,Don't Look Up,Adam McKay,Comedy,2021,18,"[Leonardo DiCaprio, Jennifer Lawrence, Meryl S...",4100,260,388000,7.3,11286314
4,How I Met Your Father,Isaac Aptaker,Comedy,2022â€“,24,"[Hilary Duff, Christopher Lowell, Francia Rais...",301,1,-1,5.1,14500082
...,...,...,...,...,...,...,...,...,...,...,...
1195,August: Osage County,John Wells,Comedy,2013,0,"[Meryl Streep, Dermot Mulroney, Julia Roberts,...",333,314,89000,7.2,1322269
1196,Like Mike,John Schultz,Comedy,2002,99,"[Shad Moss, Jonathan Lipnicki, Morris Chestnut...",76,34,20000,5.3,0308506
1197,The Jeffersons,Norman Lear,Comedy,1975–1985,0,"[Isabel Sanford, Sherman Hemsley, Roxie Roker,...",55,25,-1,7.4,0072519
1198,The Commitments,Alan Parker,Comedy,1991,118,"[Robert Arkins, Michael Aherne, Angeline Ball,...",127,62,35000,7.6,0101605


In [11]:
dfGenre1 = load_csv_by_genre(listGenres[1])
save_csv(dfGenre1,template_csv.format(listGenres[1]))
dfGenre1

Unnamed: 0,SeriesName,Creator,Genre,YearOfPublished,ChapterLengthInMiniutes,TopCast,UserReviews,CriticReviews,Views,Rating,SeriesId
0,Eternals,Chloé Zhao,Action,2021,36,"[Gemma Chan, Richard Madden, Angelina Jolie, S...",3000,314,224000,6.5,9032400
1,Archive 81,Rebecca Sonnenshine,Drama,2022–,60,"[Mamoudou Athie, Dina Shihabi, Evan Jonigkeit,...",402,40,18000,7.5,13365348
2,Peacemaker,James Gunn,Action,2022–,40,"[John Cena, Danielle Brooks, Freddie Stroma, C...",337,21,23000,8.4,13146488
3,The Book of Boba Fett,Jon Favreau,Action,2021–,38,"[Temuera Morrison, Ming-Na Wen, Matt Berry, Fr...",729,2,36000,7.7,13668894
4,,Jon Watts,Action,2021,28,"[Tom Holland, Zendaya, Benedict Cumberbatch, J...",4800,337,421000,8.7,10872600
...,...,...,...,...,...,...,...,...,...,...,...
1161,,Stephen Chiodo,Comedy,1988,88,"[Grant Cramer, Suzanne Snyder, John Allen Nels...",313,175,36000,6.2,0095444
1162,Ben 10: Omniverse,Man of Action,Animation,2012–2014,22,"[Yuri Lowenthal, Bumper Robinson, Eric Bauza, ...",76,3,-1,6.1,2293002
1163,The Nutty Professor,Tom Shadyac,Comedy,1996,95,"[Eddie Murphy, Jada Pinkett Smith, James Cobur...",139,50,115000,5.6,0117218
1164,Species II,Peter Medak,Action,1998,93,"[Natasha Henstridge, Michael Madsen, Marg Helg...",172,107,29000,4.4,0120841


In [12]:
dfGenre2 = load_csv_by_genre(listGenres[2])
save_csv(dfGenre2,template_csv.format(listGenres[2]))
dfGenre2

Unnamed: 0,SeriesName,Creator,Genre,YearOfPublished,ChapterLengthInMiniutes,TopCast,UserReviews,CriticReviews,Views,Rating,SeriesId
0,Brazen,Monika Mitchell,Crime,2022,94,"[Alyssa Milano, Sam Page, Malachi Weir, Emilie...",313,41,-1,3.9,13978306
1,The Power of the Dog,Jane Campion,Drama,2021,6,"[Benedict Cumberbatch, Kirsten Dunst, Jesse Pl...",955,256,87000,7.0,10293406
2,The King's Daughter,Sean McNamara,Action,2022,90,"[Pierce Brosnan, Kaya Scodelario, William Hurt...",26,24,-1,5.1,2328678
3,Licorice Pizza,Paul Thomas Anderson,Comedy,2021,13,"[Alana Haim, Cooper Hoffman, Sean Penn, Tom Wa...",215,163,20000,7.9,11271038
4,A Discovery of Witches,Matthew Goode,Drama,2018–2022,60,"[Matthew Goode, Teresa Palmer, Alex Kingston, ...",784,30,46000,8.0,2177461
...,...,...,...,...,...,...,...,...,...,...,...
1216,Somewhere in Time,Jeannot Szwarc,Drama,1980,103,"[Christopher Reeve, Jane Seymour, Christopher ...",315,48,29000,7.2,0081534
1217,The Space Between Us,Peter Chelsom,Drama,2017,0,"[Gary Oldman, Asa Butterfield, Carla Gugino, B...",253,155,52000,6.4,3922818
1218,The King and I,Walter Lang,Biography,1956,13,"[Yul Brynner, Deborah Kerr, Rita Moreno, Marti...",111,39,25000,7.4,0049408
1219,Été 85,François Ozon,Drama,2020,101,"[Félix Lefebvre, Benjamin Voisin, Philippine V...",47,107,-1,6.9,10457128


In [7]:
dfGenre3 = load_csv_by_genre(listGenres[3])
save_csv(dfGenre3,template_csv.format(listGenres[3]))
dfGenre3

Unnamed: 0,SeriesName,Creator,Genre,YearOfPublished,ChapterLengthInMiniutes,TopCast,UserReviews,CriticReviews,Views,Rating,SeriesId
0,Eternals,Chloé Zhao,Action,2021,36,"[Gemma Chan, Richard Madden, Angelina Jolie, S...",3000,315,224000,6.5,9032400
1,Peacemaker,James Gunn,Action,2022–,40,"[John Cena, Danielle Brooks, Freddie Stroma, C...",348,21,23000,8.4,13146488
2,The Book of Boba Fett,Jon Favreau,Action,2021–,38,"[Temuera Morrison, Ming-Na Wen, Matt Berry, Fr...",738,2,37000,7.7,13668894
3,,Jon Watts,Action,2021,28,"[Tom Holland, Zendaya, Benedict Cumberbatch, J...",4800,338,422000,8.7,10872600
4,The Witcher,Lauren Schmidt,Action,2019–,60,"[Henry Cavill, Freya Allan, Anya Chalotra, Mim...",5200,127,443000,8.2,5180504
...,...,...,...,...,...,...,...,...,...,...,...
1163,RoboCop,José Padilha,Action,2014,117,"[Joel Kinnaman, Gary Oldman, Michael Keaton, A...",739,519,224000,6.1,1234721
1164,,Nicholas Meyer,Action,1982,113,"[William Shatner, Leonard Nimoy, DeForest Kell...",427,123,118000,7.7,0084726
1165,,Mark Neveldine,Action,2011,96,"[Nicolas Cage, Ciarán Hinds, Idris Elba, Viola...",392,291,117000,4.3,1071875
1166,The VelociPastor,Brendan Steere,Action,2018,75,"[Greg Cohan, George Schewnzer, Janice Young, D...",191,57,-1,5.1,1843303


In [8]:
dfGenre4 = load_csv_by_genre(listGenres[4])
save_csv(dfGenre4,template_csv.format(listGenres[4]))
dfGenre4

Unnamed: 0,SeriesName,Creator,Genre,YearOfPublished,ChapterLengthInMiniutes,TopCast,UserReviews,CriticReviews,Views,Rating,SeriesId
0,Archive 81,Rebecca Sonnenshine,Drama,2022–,60,"[Mamoudou Athie, Dina Shihabi, Evan Jonigkeit,...",412,41,19000,7.5,13365348
1,Scream,Matt Bettinelli-Olpin,Horror,2022,114,"[Neve Campbell, Courteney Cox, David Arquette,...",737,222,31000,7.1,11245972
2,Yellowjackets,Ashley Lyle,Drama,2021–,0,"[Melanie Lynskey, Tawny Cypress, Ella Purnell,...",234,24,22000,8.1,11041332
3,Ozark,Bill Dubuque,Crime,2017–2022,60,"[Jason Bateman, Laura Linney, Sofia Hublitz, S...",1500,109,239000,8.4,5071412
4,Brazen,Monika Mitchell,Crime,2022,94,"[Alyssa Milano, Sam Page, Malachi Weir, Emilie...",313,41,-1,3.9,13978306
...,...,...,...,...,...,...,...,...,...,...,...
1205,3%,Pedro Aguilera,Action,2016–2020,49,"[Bianca Comparato, Vaneza Oliveira, Rodolfo Va...",216,30,25000,7.3,4922804
1206,Secret City,Anna Torv,Mystery,2016–2019,49,"[Anna Torv, Marcus Graham, Justin Smith, Aleks...",100,7,-1,7.4,4976512
1207,Stoker,Park Chan-wook,Drama,2013,99,"[Mia Wasikowska, Nicole Kidman, Matthew Goode,...",309,441,107000,6.8,1682180
1208,Outbreak,Wolfgang Petersen,Action,1995,7,"[Dustin Hoffman, Rene Russo, Morgan Freeman, K...",233,79,127000,6.6,0114069


In [9]:
dfGenre5 = load_csv_by_genre(listGenres[5])
save_csv(dfGenre5,template_csv.format(listGenres[5]))
dfGenre5

Unnamed: 0,SeriesName,Creator,Genre,YearOfPublished,ChapterLengthInMiniutes,TopCast,UserReviews,CriticReviews,Views,Rating,SeriesId
0,Euphoria,Sam Levinson,Drama,2019–,55,"[Hunter Schafer, Zendaya, Angus Cloud, Sydney ...",725,44,104000,8.4,8772296
1,Archive 81,Rebecca Sonnenshine,Drama,2022–,60,"[Mamoudou Athie, Dina Shihabi, Evan Jonigkeit,...",414,41,19000,7.5,13365348
2,Encanto,Jared Bush,Animation,2021,102,"[Stephanie Beatriz, María Cecilia Botero, John...",833,193,114000,7.3,2953050
3,Yellowjackets,Ashley Lyle,Drama,2021–,0,"[Melanie Lynskey, Tawny Cypress, Ella Purnell,...",234,24,22000,8.1,11041332
4,After Life,Ricky Gervais,Comedy,2019–2022,30,"[Ricky Gervais, Tom Basden, Tony Way, Diane Mo...",2000,70,112000,8.5,8398600
...,...,...,...,...,...,...,...,...,...,...,...
1207,Mid90s,Jonah Hill,Comedy,2018,85,"[Sunny Suljic, Katherine Waterston, Lucas Hedg...",332,206,59000,7.4,5613484
1208,Moon,Duncan Jones,Drama,2009,97,"[Sam Rockwell, Kevin Spacey, Dominique McEllig...",678,437,350000,7.8,1182345
1209,7th Heaven,Brenda Hampton,Comedy,1996–2007,60,"[Stephen Collins, Catherine Hicks, Jessica Bie...",284,7,25000,5.3,0115083
1210,Babylon 5,J. Michael Straczynski,Action,1993–1998,45,"[Mira Furlan, Richard Biggs, Stephen Furst, An...",240,33,30000,8.3,0105946


In [15]:
dfGenre6 = load_csv_by_genre(listGenres[6])
save_csv(dfGenre6,template_csv.format(listGenres[6]))
dfGenre6

Unnamed: 0,SeriesName,Creator,Genre,YearOfPublished,ChapterLengthInMiniutes,TopCast,UserReviews,CriticReviews,Views,Rating,SeriesId
0,Archive 81,Rebecca Sonnenshine,Drama,2022–,60,"[Mamoudou Athie, Dina Shihabi, Evan Jonigkeit,...",414,41,19000,7.5,13365348
1,Scream,Matt Bettinelli-Olpin,Horror,2022,114,"[Neve Campbell, Courteney Cox, David Arquette,...",739,224,31000,7.1,11245972
2,Yellowjackets,Ashley Lyle,Drama,2021–,0,"[Melanie Lynskey, Tawny Cypress, Ella Purnell,...",234,24,22000,8.1,11041332
3,The Witcher,Lauren Schmidt,Action,2019–,60,"[Henry Cavill, Freya Allan, Anya Chalotra, Mim...",5200,127,443000,8.2,5180504
4,Brazen,Monika Mitchell,Crime,2022,94,"[Alyssa Milano, Sam Page, Malachi Weir, Emilie...",313,41,-1,3.9,13978306
...,...,...,...,...,...,...,...,...,...,...,...
1209,The Big Sleep,Howard Hawks,Crime,1946,114,"[Humphrey Bogart, Lauren Bacall, John Ridgely,...",306,118,84000,7.9,0038355
1210,30 Monedas,Eduard Fernández,Drama,2020–,60,"[Eduard Fernández, Megan Montaner, Miguel Ánge...",109,15,-1,7.2,9764386
1211,24: Live Another Day,Robert Cochran,Action,2014,46,"[Kiefer Sutherland, Yvonne Strahovski, Tate Do...",72,42,45000,8.3,1598754
1212,,Bartosz M. Kowalski,Adventure,2020,102,"[Julia Wieniawa-Narkiewicz, Michal Lupa, Wikto...",181,35,11000,4.9,11240506


In [16]:
dfGenre7 = load_csv_by_genre(listGenres[7])
save_csv(dfGenre7,template_csv.format(listGenres[7]))
dfGenre7

Unnamed: 0,SeriesName,Creator,Genre,YearOfPublished,ChapterLengthInMiniutes,TopCast,UserReviews,CriticReviews,Views,Rating,SeriesId
0,Peacemaker,James Gunn,Action,2022–,40,"[John Cena, Danielle Brooks, Freddie Stroma, C...",350,21,23000,8.4,13146488
1,Ozark,Bill Dubuque,Crime,2017–2022,60,"[Jason Bateman, Laura Linney, Sofia Hublitz, S...",1500,109,239000,8.4,5071412
2,Brazen,Monika Mitchell,Crime,2022,94,"[Alyssa Milano, Sam Page, Malachi Weir, Emilie...",313,41,-1,3.9,13978306
3,Stay Close,Cush Jumbo,Crime,2021,15,"[Cush Jumbo, Richard Armitage, Daniel Francis,...",480,23,21000,7.0,2201227
4,Dexter: New Blood,Clyde Phillips,Crime,2021–2022,49,"[Michael C. Hall, Jack Alcott, Julia Jones, Jo...",1100,14,67000,8.3,14164730
...,...,...,...,...,...,...,...,...,...,...,...
1210,Cracker,Jimmy McGovern,Crime,1993–1996,100,"[Robbie Coltrane, Geraldine Somerville, Kieran...",40,17,-1,8.4,0105977
1211,The Jesus Rolls,John Turturro,Comedy,2019,85,"[John Turturro, Bobby Cannavale, Audrey Tautou...",106,57,-1,4.4,5974030
1212,Blue Ruin,Jeremy Saulnier,Crime,2013,90,"[Macon Blair, Devin Ratray, Amy Hargreaves, Ke...",250,297,72000,7.1,2359024
1213,Mocro Maffia,Robert de Hoog,Crime,2018–,60,"[Robert de Hoog, Achmed Akkabi, Nasrdin Dchar,...",27,-1,-1,8.2,8810204


In [17]:
dfGenre8 = load_csv_by_genre(listGenres[8])
save_csv(dfGenre8,template_csv.format(listGenres[8]))
dfGenre8

Unnamed: 0,SeriesName,Creator,Genre,YearOfPublished,ChapterLengthInMiniutes,TopCast,UserReviews,CriticReviews,Views,Rating,SeriesId
0,Encanto,Jared Bush,Animation,2021,102,"[Stephanie Beatriz, María Cecilia Botero, John...",833,193,114000,7.3,2953050
1,Shingeki no Kyojin,Yûki Kaji,Animation,2013–2022,24,"[Yûki Kaji, Marina Inoue, Yui Ishikawa, Josh G...",1600,54,294000,9.0,2560140
2,,Derek Drymon,Animation,2022,87,"[Andy Samberg, Selena Gomez, Kathryn Hahn, Jim...",162,55,15000,6.1,9848626
3,Sing 2,Garth Jennings,Animation,2021,110,"[Matthew McConaughey, Reese Witherspoon, Scarl...",255,96,24000,7.6,6467266
4,The House,Matthew Goode,Animation,2022–,97,"[Matthew Goode, Paul Kaye, Helena Bonham Carte...",89,36,-1,7.0,11703050
...,...,...,...,...,...,...,...,...,...,...,...
1183,,Jake Castorena,Animation,2019,46,"[Jerry O'Connell, Rebecca Romijn, Rainn Wilson...",8,3,-1,7.2,11043546
1184,,John Kricfalusi,Animation,2003,30,"[John Kricfalusi, Eric Bauza, Mike Kricfalusi,...",38,3,2000,5.5,0371475
1185,Corner Gas Animated,Brent Butt,Animation,2018–2021,22,"[Brent Butt, Gabrielle Miller, Fred Ewanuick, ...",40,-1,-1,7.7,6881158
1186,,Hisao Egawa,Animation,1993–1996,22,"[Hisao Egawa, Eriko Hara, Akiko Hiramatsu, Yuk...",16,7,-1,8.7,0965547


In [18]:
dfGenre9 = load_csv_by_genre(listGenres[9])
save_csv(dfGenre9,template_csv.format(listGenres[9]))
dfGenre9

Unnamed: 0,SeriesName,Creator,Genre,YearOfPublished,ChapterLengthInMiniutes,TopCast,UserReviews,CriticReviews,Views,Rating,SeriesId
0,Eternals,Chloé Zhao,Action,2021,36,"[Gemma Chan, Richard Madden, Angelina Jolie, S...",3000,316,225000,6.5,9032400
1,Peacemaker,James Gunn,Action,2022–,40,"[John Cena, Danielle Brooks, Freddie Stroma, C...",351,21,24000,8.4,13146488
2,The Book of Boba Fett,Jon Favreau,Action,2021–,38,"[Temuera Morrison, Ming-Na Wen, Matt Berry, Fr...",742,2,37000,7.7,13668894
3,Encanto,Jared Bush,Animation,2021,102,"[Stephanie Beatriz, María Cecilia Botero, John...",833,193,115000,7.3,2953050
4,,Jon Watts,Action,2021,28,"[Tom Holland, Zendaya, Benedict Cumberbatch, J...",4800,339,423000,8.7,10872600
...,...,...,...,...,...,...,...,...,...,...,...
1160,Halo Infinite,Dan Chosich,Action,2021,0,"[Steve Downes, Jen Taylor, Nicolas Roye, Bruce...",22,5,-1,8.6,9437014
1161,Æon Flux,Karyn Kusama,Action,2005,93,"[Charlize Theron, Frances McDormand, Sophie Ok...",572,75,127000,5.5,0402022
1162,The Sleepover,Trish Sie,Action,2020,100,"[Sadie Stanley, Maxwell Simkins, Cree Cicchino...",146,40,-1,5.7,10888708
1163,LazyTown,Magnús Scheving,Adventure,2002–2014,30,"[Lorraine Parsloe, Magnús Scheving, Stefán Kar...",115,2,-1,6.0,0396991


In [19]:
dfGenre10 = load_csv_by_genre(listGenres[10])
save_csv(dfGenre10,template_csv.format(listGenres[10]))
dfGenre10

Unnamed: 0,SeriesName,Creator,Genre,YearOfPublished,ChapterLengthInMiniutes,TopCast,UserReviews,CriticReviews,Views,Rating,SeriesId
0,Eternals,Chloé Zhao,Action,2021,36,"[Gemma Chan, Richard Madden, Angelina Jolie, S...",3000,317,225000,6.5,9032400
1,Peacemaker,James Gunn,Action,2022–,40,"[John Cena, Danielle Brooks, Freddie Stroma, C...",353,22,24000,8.4,13146488
2,Encanto,Jared Bush,Animation,2021,102,"[Stephanie Beatriz, María Cecilia Botero, John...",834,193,115000,7.3,2953050
3,,Jon Watts,Action,2021,28,"[Tom Holland, Zendaya, Benedict Cumberbatch, J...",4800,339,423000,8.7,10872600
4,The Witcher,Lauren Schmidt,Action,2019–,60,"[Henry Cavill, Freya Allan, Anya Chalotra, Mim...",5200,127,443000,8.2,5180504
...,...,...,...,...,...,...,...,...,...,...,...
1174,The Last Unicorn,Jules Bass,Animation,1982,92,"[Jeff Bridges, Mia Farrow, Angela Lansbury, Al...",176,58,26000,7.4,0084237
1175,Bai She: Yuan qi,Amp Wong,Animation,2019,99,"[Vincent Rodriguez III, Matthew Moy, Stephanie...",55,45,-1,7.1,9288776
1176,Kung Fury,David Sandberg,Short,2015,31,"[David Sandberg, Jorma Taccone, Steven Chew, L...",128,77,58000,8.0,3472226
1177,,Park Bo-Young,Drama,2021–,65,"[Park Bo-Young, Seo In-Guk, Lee Soo-hyuk, Tae-...",71,2,-1,8.1,13669128


In [20]:
dfGenre11 = load_csv_by_genre(listGenres[11])
save_csv(dfGenre11,template_csv.format(listGenres[11]))
dfGenre11

Unnamed: 0,SeriesName,Creator,Genre,YearOfPublished,ChapterLengthInMiniutes,TopCast,UserReviews,CriticReviews,Views,Rating,SeriesId
0,Archive 81,Rebecca Sonnenshine,Drama,2022–,60,"[Mamoudou Athie, Dina Shihabi, Evan Jonigkeit,...",419,41,19000,7.5,13365348
1,Scream,Matt Bettinelli-Olpin,Horror,2022,114,"[Neve Campbell, Courteney Cox, David Arquette,...",739,225,31000,7.1,11245972
2,Yellowjackets,Ashley Lyle,Drama,2021–,0,"[Melanie Lynskey, Tawny Cypress, Ella Purnell,...",236,24,22000,8.1,11041332
3,Shingeki no Kyojin,Yûki Kaji,Animation,2013–2022,24,"[Yûki Kaji, Marina Inoue, Yui Ishikawa, Josh G...",1600,54,295000,9.0,2560140
4,,Derek Drymon,Animation,2022,87,"[Andy Samberg, Selena Gomez, Kathryn Hahn, Jim...",162,55,15000,6.1,9848626
...,...,...,...,...,...,...,...,...,...,...,...
1193,Days Gone,John Garvin,Action,2019,0,"[Sam Witwer, Jim Pirri, Courtnee Draper, Nishi...",102,5,-1,8.4,6795336
1194,,Bill Motz,Animation,2021–,22,"[Ashly Burch, Dana Snyder, Michaela Dietz, Sum...",20,2,600,7.1,10684374
1195,Tetsuo,Shin'ya Tsukamoto,Horror,1989,67,"[Tomorô Taguchi, Kei Fujiwara, Nobu Kanaoka, S...",164,125,22000,7.0,0096251
1196,,Rachel Talalay,Fantasy,1991,89,"[Robert Englund, Lisa Zane, Shon Greenblatt, L...",311,93,45000,4.9,0101917


add more genre - "super Hero" Genre - which has a different Url 

In [22]:
columns = ['SeriesName', 'Creator', 'Genre', 'YearOfPublished', 'ChapterLengthInMiniutes', 'TopCast', 'UserReviews','CriticReviews', 'Views', 'Rating', 'SeriesId']
urlTemplate_4='https://www.imdb.com/search/keyword/?keywords=superhero&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=a581b14c-5a82-4e29-9cf8-54f909ced9e1&pf_rd_r=5DK0FVYYR5XYQ21GD0TG&pf_rd_s=center-5&pf_rd_t=15051&pf_rd_i=genre&ref_=kw_nxt&sort=moviemeter,asc&mode=detail&page={}'
url_template2= "https://www.imdb.com/{}"
count = 1
superHeroGenre_data_frame = pd.DataFrame(columns=columns)
while count<26:
    link_to_genre_page =[]
    r4=requests.get(urlTemplate_4.format(count))
    soup4 =  BeautifulSoup(r4.content)
    for s in soup4.findAll("div",attrs={"class":"lister-item mode-detail"}):
        link_to_genre_page.append(s.find('a')['href'])
    for url in link_to_genre_page:
        formath=url.split("tt")
        num=formath[1].split("/")
        seriesId= num[0]
        r2=requests.get(url_template2.format(url))
        data = extract_data_html(r2.content) 
        if data is not None:
            if seriesId is not None:
                data.append(seriesId)
            else:
                seriesId =0
            d = {col: data[i] for i, col in enumerate(columns)}
            superHeroGenre_data_frame = superHeroGenre_data_frame.append(d, ignore_index=True)
    count = count + 1 
save_csv(superHeroGenre_data_frame,'superhero')
superHeroGenre_data_frame

Unnamed: 0,SeriesName,Creator,Genre,YearOfPublished,ChapterLengthInMiniutes,TopCast,UserReviews,CriticReviews,Views,Rating,SeriesId
0,Eternals,Chloé Zhao,Action,2021,36,"[Gemma Chan, Richard Madden, Angelina Jolie, S...",3000,317,227000,6.5,9032400
1,Peacemaker,James Gunn,Action,2022–,40,"[John Cena, Danielle Brooks, Freddie Stroma, C...",357,22,24000,8.4,13146488
2,,Jon Watts,Action,2021,28,"[Tom Holland, Zendaya, Benedict Cumberbatch, J...",4800,339,424000,8.7,10872600
3,The Suicide Squad,James Gunn,Action,2021,12,"[Margot Robbie, Idris Elba, John Cena, Joel Ki...",3300,361,293000,7.2,6334354
4,Daredevil,Drew Goddard,Action,2015–2018,54,"[Charlie Cox, Vincent D'Onofrio, Deborah Ann W...",1100,191,404000,8.6,3322312
...,...,...,...,...,...,...,...,...,...,...,...
1142,,Clare Grant,Comedy,2014–,0,"[Clare Grant, Rileah Vanderbilt, Milynn Sarley...",-1,-1,14,7.3,3532204
1143,,Genndy Tartakovsky,Animation,1999,50,"[Christine Cavanaugh, Eddie Deezen, Jeff Benne...",16,1,-1,7.6,0293092
1144,Mugamoodi,Mysskin,Action,2012,30,"[Jiiva, Narain, Nassar, Pooja Hegde, Selvaah, ...",11,5,-1,5.0,2367996
1145,Infinite Crisis,Cardell Kerr,Action,2015,0,"[Ike Amadi, Robin Atkin Downes, Laura Bailey, ...",-1,-1,91,7.4,4230400


In [24]:
listGenres=['comedy','sci-fi','romance','action','Thriller','drama','mystery','crime','animation','adventure','fantasy','horror']
dfGenre0 = pd.read_csv('comedy.csv')
dfGenre1 = pd.read_csv('sci-fi.csv')
dfGenre2 = pd.read_csv('romance.csv')
dfGenre3 = pd.read_csv('action.csv')
dfGenre4 = pd.read_csv('Thriller.csv')
dfGenre5 = pd.read_csv('drama.csv')
dfGenre6 = pd.read_csv('mystery.csv')
dfGenre7 = pd.read_csv('crime.csv')
dfGenre8 = pd.read_csv('animation.csv')
dfGenre9 = pd.read_csv('adventure.csv')
dfGenre10 = pd.read_csv('fantasy.csv')
dfGenre11 = pd.read_csv('horror.csv')

merge the data into one table 

In [25]:
mergeGenreImdb_df = pd.concat([dfGenre0,dfGenre1,dfGenre2,dfGenre3,dfGenre4,dfGenre5,dfGenre6,dfGenre7,dfGenre8,dfGenre9,dfGenre10,dfGenre11,superHeroGenre_data_frame], ignore_index=True)
mergeGenreImdb_df.to_csv('mergeGenreImdb.csv', index = False,encoding='iso8859-8',errors='replace')

In [27]:
mergeGenreImdb_df

Unnamed: 0.1,Unnamed: 0,SeriesName,Creator,Genre,YearOfPublished,ChapterLengthInMiniutes,TopCast,UserReviews,CriticReviews,Views,Rating,SeriesId
0,0.0,Peacemaker,James Gunn,Action,2022?,40,"['John Cena', 'Danielle Brooks', 'Freddie Stro...",335,21,22000,8.4,13146488
1,1.0,Encanto,Jared Bush,Animation,2021,102,"['Stephanie Beatriz', 'Mar?a Cecilia Botero', ...",827,192,113000,7.3,2953050
2,2.0,After Life,Ricky Gervais,Comedy,2019?2022,30,"['Ricky Gervais', 'Tom Basden', 'Tony Way', 'D...",2000,70,112000,8.5,8398600
3,3.0,Don't Look Up,Adam McKay,Comedy,2021,18,"['Leonardo DiCaprio', 'Jennifer Lawrence', 'Me...",4100,260,388000,7.3,11286314
4,4.0,How I Met Your Father,Isaac Aptaker,Comedy,2022???,24,"['Hilary Duff', 'Christopher Lowell', 'Francia...",301,1,-1,5.1,14500082
...,...,...,...,...,...,...,...,...,...,...,...,...
15478,,,Clare Grant,Comedy,2014–,0,"[Clare Grant, Rileah Vanderbilt, Milynn Sarley...",-1,-1,14,7.3,3532204
15479,,,Genndy Tartakovsky,Animation,1999,50,"[Christine Cavanaugh, Eddie Deezen, Jeff Benne...",16,1,-1,7.6,0293092
15480,,Mugamoodi,Mysskin,Action,2012,30,"[Jiiva, Narain, Nassar, Pooja Hegde, Selvaah, ...",11,5,-1,5.0,2367996
15481,,Infinite Crisis,Cardell Kerr,Action,2015,0,"[Ike Amadi, Robin Atkin Downes, Laura Bailey, ...",-1,-1,91,7.4,4230400


add more data to the table

In [30]:
columns = ['SeriesName', 'Creator', 'Genre', 'YearOfPublished', 'ChapterLengthInMiniutes', 'TopCast', 'UserReviews','CriticReviews', 'Views', 'Rating', 'SeriesId']
urlTemplate_5='https://www.imdb.com/chart/bottom?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=9QAJSG2RY0XS2Y0NK4GQ&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=moviemeter&ref_=chtmvm_ql_8'
url_template2= "https://www.imdb.com/{}"
more_data_frame = pd.DataFrame(columns=columns)
link_to_genre_page =[]
r5=requests.get(urlTemplate_5)
soup5 =  BeautifulSoup(r5.content)
for s in soup5.findAll("td",attrs={"class":"titleColumn"}):
    link_to_genre_page.append(s.find('a')['href'])
for url in link_to_genre_page:
    formath=url.split("tt")
    num=formath[1].split("/")
    seriesId= num[0]
    r2=requests.get(url_template2.format(url))
    data = extract_data_html(r2.content) 
    if data is not None:
        if seriesId is not None:
            data.append(seriesId)
        else:
            seriesId =0
        d = {col: data[i] for i, col in enumerate(columns)}
        more_data_frame = more_data_frame.append(d, ignore_index=True)
save_csv(more_data_frame,'moreData')
more_data_frame


Unnamed: 0,SeriesName,Creator,Genre,YearOfPublished,ChapterLengthInMiniutes,TopCast,UserReviews,CriticReviews,Views,Rating,SeriesId
0,Disaster Movie,Jason Friedberg,Comedy,2008,87,"[Carmen Electra, Vanessa Lachey, Nicole Parker...",471,105,90000,1.9,1213644
1,,Bob Clark,Comedy,2004,88,"[Jon Voight, Scott Baio, Vanessa Angel, Skyler...",146,36,31000,1.9,0270846
2,Kod Adi K.O.Z.,Celal Çimen,Crime,2015,114,"[Cem Kurtoglu, Hakan Ural, Hazim Körmükcü, Tol...",73,3,28000,2.0,4458206
3,,Harold P. Warren,Horror,1966,70,"[Tom Neyman, John Reynolds, Diane Adelson, Har...",737,113,36000,1.8,0060666
4,Saving Christmas,Darren Doane,Comedy,2014,79,"[Kirk Cameron, Darren Doane, Bridgette Cameron...",190,30,16000,1.4,4009460
...,...,...,...,...,...,...,...,...,...,...,...
95,The Love Guru,Marco Schnabel,Comedy,2008,87,"[Mike Myers, Jessica Alba, Romany Malco, Jessi...",310,148,52000,3.8,0811138
96,,Brian Levant,Comedy,2000,90,"[Mark Addy, Stephen Baldwin, Kristen Johnston,...",107,54,23000,3.7,0158622
97,,Andrzej Bartkowiak,Action,2009,96,"[Kristin Kreuk, Neal McDonough, Michael Clarke...",201,96,24000,3.7,0891592
98,Stan Helsing,Bo Zenga,Comedy,2009,108,"[Steve Howey, Diora Baird, Kenan Thompson, Des...",94,48,13000,3.6,1185266


In [29]:
columns = ['SeriesName', 'Creator', 'Genre', 'YearOfPublished', 'ChapterLengthInMiniutes', 'TopCast', 'UserReviews','CriticReviews', 'Views', 'Rating', 'SeriesId']
urlTemplate_4='https://www.imdb.com/search/title/?groups=bottom_250&start={}&ref_=adv_nxt'
url_template2= "https://www.imdb.com/{}"
count = 1
nextPage=1
more_data_frame2 = pd.DataFrame(columns=columns)
while count<6:
    link_to_genre_page =[]
    r4=requests.get(urlTemplate_4.format(nextPage))
    soup4 = BeautifulSoup(r4.content)
    for s in soup4.findAll("div",attrs={"class":"lister-item-content"}):
        link_to_genre_page.append(s.find('a')['href'])
    for url in link_to_genre_page:
        formath=url.split("tt")
        num=formath[1].split("/")
        seriesId= num[0]
        r2=requests.get(url_template2.format(url))
        data = extract_data_html(r2.content) 
        if data is not None:
            if seriesId is not None:
                data.append(seriesId)
            else:
                seriesId =0
            d = {col: data[i] for i, col in enumerate(columns)}
            more_data_frame2 = more_data_frame2.append(d, ignore_index=True)
    count = count + 1
    nextPage=nextPage+1
save_csv(more_data_frame2,'moreData2')
more_data_frame2


Unnamed: 0,SeriesName,Creator,Genre,YearOfPublished,ChapterLengthInMiniutes,TopCast,UserReviews,CriticReviews,Views,Rating,SeriesId
0,365 dni,Barbara Bialowas,Drama,2020,114,"[Anna Maria Sieklucka, Michele Morrone, Bronis...",1400,50,72000,3.3,10886166
1,Fifty Shades of Grey,Sam Taylor-Johnson,Drama,2015,5,"[Dakota Johnson, Jamie Dornan, Jennifer Ehle, ...",1600,404,311000,4.1,2322441
2,Street Fighter,Steven E. de Souza,Action,1994,102,"[Jean-Claude Van Damme, Raul Julia, Ming-Na We...",345,88,70000,4.0,0111301
3,He's All That,Mark Waters,Comedy,2021,88,"[Addison Rae, Tanner Buchanan, Madison Pettis,...",840,55,22000,4.3,4590256
4,Fantastic Four,Josh Trank,Action,2015,100,"[Miles Teller, Kate Mara, Michael B. Jordan, J...",930,417,161000,4.2,1502712
...,...,...,...,...,...,...,...,...,...,...,...
245,Bratz,Sean McNamara,Comedy,2007,110,"[Skyler Shaye, Janel Parrish, Logan Browning, ...",128,59,23000,3.2,0804452
246,,Dee Rees,Crime,2020,115,"[Anne Hathaway, Ben Affleck, Rosie Perez, Will...",422,54,15000,4.3,7456312
247,Meet the Spartans,Jason Friedberg,Comedy,2008,87,"[Sean Maguire, Kevin Sorbo, Carmen Electra, Ke...",448,111,107000,2.8,1073498
248,Supergirl,Jeannot Szwarc,Action,1984,4,"[Helen Slater, Faye Dunaway, Peter O'Toole, Mi...",186,99,22000,4.4,0088206


In [47]:
#mergeGenreImdb_df=pd.read_csv('mergeGenreImdb.csv')

In [31]:
mergeGenreImdbFinal_df= pd.concat([mergeGenreImdb_df,more_data_frame,more_data_frame2], ignore_index=True)


In [38]:
mergeGenreImdbFinal_df

Unnamed: 0,SeriesName,Creator,Genre,YearOfPublished,ChapterLengthInMiniutes,TopCast,UserReviews,CriticReviews,Views,Rating,SeriesId
0,Peacemaker,James Gunn,Action,2022?,40,"['John Cena', 'Danielle Brooks', 'Freddie Stro...",335,21,22000,8.4,13146488
1,Encanto,Jared Bush,Animation,2021,102,"['Stephanie Beatriz', 'Mar?a Cecilia Botero', ...",827,192,113000,7.3,2953050
2,After Life,Ricky Gervais,Comedy,2019?2022,30,"['Ricky Gervais', 'Tom Basden', 'Tony Way', 'D...",2000,70,112000,8.5,8398600
3,Don't Look Up,Adam McKay,Comedy,2021,18,"['Leonardo DiCaprio', 'Jennifer Lawrence', 'Me...",4100,260,388000,7.3,11286314
4,How I Met Your Father,Isaac Aptaker,Comedy,2022???,24,"['Hilary Duff', 'Christopher Lowell', 'Francia...",301,1,-1,5.1,14500082
...,...,...,...,...,...,...,...,...,...,...,...
15828,Bratz,Sean McNamara,Comedy,2007,110,"['Skyler Shaye', 'Janel Parrish', 'Logan Brown...",128,59,23000,3.2,804452
15829,,Dee Rees,Crime,2020,115,"['Anne Hathaway', 'Ben Affleck', 'Rosie Perez'...",422,54,15000,4.3,7456312
15830,Meet the Spartans,Jason Friedberg,Comedy,2008,87,"['Sean Maguire', 'Kevin Sorbo', 'Carmen Electr...",448,111,107000,2.8,1073498
15831,Supergirl,Jeannot Szwarc,Action,1984,4,"['Helen Slater', 'Faye Dunaway', ""Peter O'Tool...",186,99,22000,4.4,88206


the final version of the data 

In [39]:
mergeGenreImdbFinal_df.to_csv('mergeGenreImdbFinal.csv', index = False,encoding='iso8859-8',errors='replace')

End of Data Acquisition