In [34]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

In [35]:
url = "https://www.imdb.com/search/title/?title_type=tv_series,tv_miniseries&genres=adventure" #the url of the first page I would be scraping. In this link, &start = 0, but it is hidden.

In [36]:
TV_Shows = [] #A list to store the unique tv_shows

In [37]:
imbd_content = requests.get(url).content #Here, I send a request for each page.

soup = BeautifulSoup(imbd_content, "lxml")

In [38]:
def imbdScraper():

    for page in range(1,31):
        page_url = f"{url}&start={(page-1)*50 + 1}"  #For each page, the code constructs a unique URL that points to the page
        print(f"Scraping page {page}: {page_url}")

        imbd_content = requests.get(page_url).content #Here, I send a request for each page.

        soup = BeautifulSoup(imbd_content, "lxml")

        results = soup.find_all("div", class_ = "lister-item mode-advanced") #find all div tags with the 'lister-item mode advanced' class



        for result in results:     #for every block in results above,
            show_title = result.h3.a.text    #store the title of the TV show on that block
            Genre = result.find("span", class_ = "genre").text.strip() #store the genre of the movie on that block
            Run_Time = result.find("span", class_="runtime")  #store the run time of the movie on that block
            if Run_Time:     #if that block has a run time, store it
                Run_Time = Run_Time.text.strip()   
            else:            
                Run_Time = "Run Time N/A"   #if the block doesn't have a runtime, say "Run Time N/A"
            Rating = result.find("div", class_ = "inline-block ratings-imdb-rating")
            if Rating:      #if that block has a Rating, store it
                Rating = Rating.text.strip()
            else:
                Rating = "Rating N/A"  #else say "Rating N/A"
            Number_Of_Votes = result.find("p", class_ = "sort-num_votes-visible")  #Store the number of votes that movie block has
            if Number_Of_Votes:   
                Number_Of_Votes = Number_Of_Votes.text.split()[1] #if it has votes, store it
            else:
                Number_Of_Votes = "Votes N/A"  #else, say 'Votes N/A'
            Year = result.find('span', class_ = "lister-item-year text-muted unbold").text #store the year that movie was released
            Synopsis = result.find_all("p", class_ = "text-muted")[1].text.strip() #store the synopsis of that movie
            Actor = result.find("p", class_ = "").text.strip().split(",") 
            Actors = ", ".join([actor.replace("Stars:", "").strip() for actor in Actor]) #the actors of that particular movie

            try:
                certificate = result.find("span", class_="certificate").text.strip() #if the movie has a certificate, store it
            except:
                certificate = "Certificate N/A" #else say 'Certificate N/A'

            Image = result.find("div", class_ ="lister-item-image float-left") #store the cover image of each movie block
            if Image:
                image = Image.select_one("img")['src']
            else:
                image = "Image N/A"  #if the movie block has no image, say "Image N/A"


            tv_show = {'Title': show_title, "Genre": Genre, "Run Time": Run_Time, "Rating": Rating,
                      "Number of Votes": Number_Of_Votes, "Year": Year, "Certificate": certificate, "Synopsis": Synopsis, "Actors": Actors, "Image": image}
            #I use this dictionary- tv_show to store the details of that movie block
            
            TV_Shows.append(tv_show) #I add this dictionary to my list of TV shows, such that every TV_show has its details stored in its own dictionary
            #Once I finished adding all the dictionaries to my list of TV_shows, I jump out of the loop as there are no more movies on that page.
        
        
        
        next_button = soup.find("div", class_ = "desc").find("a", class_ = "lister-page-next next-page")
        #This is where things get interesting. I use Beautiful Soup to check if there is a "Next Page" button on that particular page. 
        if next_button:
            nextpage_url = "https://www.imdb.com" + next_button["href"] #If there is a next button, concatenate the domain name of the website to the content of the href attribute,
                                                                        #such that when a request is sent, it goes to that address.
            imbd_content = requests.get(nextpage_url).content  #to get the content of the next page, a request is sent to the web address of that next page.
            soup = BeautifulSoup(imbd_content, "lxml")
            time.sleep(3)  #after 3 seconds, a new request is sent to the next page.
        else:
            break    #if there is no next button, the loop breaks

    TV_ShowsDF = pd.DataFrame(TV_Shows) #Here, I convert the scraped data(which is already stored in TV_Shows) into a pandas DataFrame
    TV_ShowsDF.to_csv('tv_shows.csv', index=False)
    return TV_ShowsDF    #The DataFrame is returned each time this function is called.

In [39]:
TV_ShowsDF = imbdScraper()

Scraping page 1: https://www.imdb.com/search/title/?title_type=tv_series,tv_miniseries&genres=adventure&start=1
Scraping page 2: https://www.imdb.com/search/title/?title_type=tv_series,tv_miniseries&genres=adventure&start=51
Scraping page 3: https://www.imdb.com/search/title/?title_type=tv_series,tv_miniseries&genres=adventure&start=101
Scraping page 4: https://www.imdb.com/search/title/?title_type=tv_series,tv_miniseries&genres=adventure&start=151
Scraping page 5: https://www.imdb.com/search/title/?title_type=tv_series,tv_miniseries&genres=adventure&start=201
Scraping page 6: https://www.imdb.com/search/title/?title_type=tv_series,tv_miniseries&genres=adventure&start=251
Scraping page 7: https://www.imdb.com/search/title/?title_type=tv_series,tv_miniseries&genres=adventure&start=301
Scraping page 8: https://www.imdb.com/search/title/?title_type=tv_series,tv_miniseries&genres=adventure&start=351
Scraping page 9: https://www.imdb.com/search/title/?title_type=tv_series,tv_miniseries&genr

# SOME INFORMATION ON OUR DATAFRAME

In [42]:
# Convert the list of dictionaries (TV_Shows) into a DataFrame and name it TV_ShowsDF
TV_ShowsDF = pd.DataFrame(TV_Shows)

# Display basic statistics of numeric columns
numeric_columns = ['Rating', 'Number of Votes']
TV_ShowsDF[numeric_columns].describe()

Unnamed: 0,Rating,Number of Votes
count,1500.0,1500
unique,69.0,1363
top,7.2,Votes N/A
freq,77.0,63


In [43]:
# Count unique values in the 'Certificate' column
certificate_counts = TV_ShowsDF['Certificate'].value_counts()

# Display the most common certificates
print("Most common certificates:")
certificate_counts.head()

Most common certificates:


Certificate N/A    1484
PG                    6
PG-13                 3
15                    2
G                     2
Name: Certificate, dtype: int64

In [44]:
# Count unique values in the 'Genre' column (splitting by commas)
genres = TV_ShowsDF['Genre'].str.split(', ').explode()
genre_counts = genres.value_counts()

# Display the most common genres
print("\nMost common genres:")
genre_counts.head()


Most common genres:


Adventure    1455
Action        890
Animation     743
Comedy        335
Drama         333
Name: Genre, dtype: int64

In [46]:
# Print summary information about the DataFrame
print("\nDataFrame summary:")
TV_ShowsDF.info()


DataFrame summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Title            1500 non-null   object
 1   Genre            1500 non-null   object
 2   Run Time         1500 non-null   object
 3   Rating           1500 non-null   object
 4   Number of Votes  1500 non-null   object
 5   Year             1500 non-null   object
 6   Certificate      1500 non-null   object
 7   Synopsis         1500 non-null   object
 8   Actors           1500 non-null   object
 9   Image            1500 non-null   object
dtypes: object(10)
memory usage: 117.3+ KB


In [47]:
# Display the first few rows of the DataFrame
print("\nFirst few rows of the DataFrame:")
TV_ShowsDF.head()


First few rows of the DataFrame:


Unnamed: 0,Title,Genre,Run Time,Rating,Number of Votes,Year,Certificate,Synopsis,Actors,Image
0,One Piece,"Action, Adventure, Comedy",60 min,8.5,59605,(2023– ),Certificate N/A,"In a seafaring world, a young pirate captain s...","Iñaki Godoy, Emily Rudd, Mackenyu, Vincent Regan",https://m.media-amazon.com/images/S/sash/4Fyxw...
1,Ahsoka,"Action, Adventure, Drama",Run Time N/A,8.0,26398,(2023– ),Certificate N/A,"After the fall of the Galactic Empire, former ...","Rosario Dawson, Natasha Liu Bordizzo, Mary Eli...",https://m.media-amazon.com/images/S/sash/4Fyxw...
2,Who Is Erin Carter?,"Action, Adventure, Crime",338 min,6.5,7560,(2023),Certificate N/A,"Erin Carter, a British teacher in Spain, finds...","Evin Ahmad, Sean Teale, Denise Gough, Indica W...",https://m.media-amazon.com/images/S/sash/4Fyxw...
3,The Wheel of Time,"Action, Adventure, Drama",60 min,7.1,115332,(2021– ),Certificate N/A,Set in a high fantasy world where magic exists...,"Rosamund Pike, Daniel Henney, Madeleine Madden...",https://m.media-amazon.com/images/S/sash/4Fyxw...
4,Game of Thrones,"Action, Adventure, Drama",57 min,9.2,2200627,(2011–2019),18,Nine noble families fight for control over the...,"Emilia Clarke, Peter Dinklage, Kit Harington, ...",https://m.media-amazon.com/images/S/sash/4Fyxw...
