In [12]:
import csv
import os
import sys
import pandas as pd
import numpy as np

In [4]:
# Load a CSV file into a pandas DataFrame.
def read_csv(file):
    """Parse ``file`` with ``pd.read_csv`` (default options) and return the DataFrame.

    ``file`` is handed to ``pd.read_csv`` unchanged, so anything that function
    accepts (a path string, an open file-like object, ...) works here.
    """
    return pd.read_csv(file)


In [7]:
# Relative path of the raw TV-series dataset.
file = "TV Series.csv"
# Load the raw frame; the following cells refer to it as `df`.
df = read_csv(file)
# Plain-text preview of the first five rows.
print(df.head())

      Series Title Release Year Runtime                    Genre Rating  \
0        Wednesday     (2022– )  45 min   Comedy, Crime, Fantasy    8.2   
1      Yellowstone     (2018– )  60 min           Drama, Western    8.7   
2  The White Lotus  (2021–2023)  60 min            Comedy, Drama    7.9   
3             1923  (2022–2023)  60 min           Drama, Western    8.6   
4        Jack Ryan     (2018– )  60 min  Action, Drama, Thriller    8.0   

                                                Cast  \
0  Jenna Ortega, Hunter Doohan, Percy Hynes White...   
1  Kevin Costner, Luke Grimes, Kelly Reilly, Wes ...   
2  Jennifer Coolidge, Jon Gries, F. Murray Abraha...   
3  Harrison Ford, Helen Mirren, Brandon Sklenar, ...   
4  John Krasinski, Wendell Pierce, Michael Kelly,...   

                                            Synopsis  
0  Follows Wednesday Addams' years as a student, ...  
1  A ranching family in Montana faces off against...  
2  Set in a tropical resort, it follows the exp

In [8]:
# (rows, columns) of the raw frame.
df.shape

(50000, 7)

In [9]:
# Peek at 10 random rows; the fixed seed keeps the preview identical on every re-run.
df.sample(10, random_state=42)

Unnamed: 0,Series Title,Release Year,Runtime,Genre,Rating,Cast,Synopsis
23014,Slow Horses,(2022– ),****,"Drama, Thriller",7.9,"Gary Oldman, Jack Lowden, Kristin Scott Thomas...",Follows a team of British intelligence agents ...
42473,House of the Dragon,(2022– ),****,"Action, Adventure, Drama",8.5,"Rhys Ifans, Matt Smith, Fabien Frankel, Graham...",An internal succession war within House Targar...
11952,The White Lotus,(2021–2023),60 min,"Comedy, Drama",7.9,"Jennifer Coolidge, Jon Gries, F. Murray Abraha...","Set in a tropical resort, it follows the explo..."
43278,Black Snow,(2023– ),52 min,Crime,7.4,"Travis Fimmel, Brooke Satchwell, Kestie Morass...","In 1995, seventeen-year-old Isabel Baker was m..."
24724,The Crown,(2016– ),58 min,"Biography, Drama, History",8.7,"Claire Foy, Olivia Colman, Imelda Staunton, Ma...",Follows the political rivalries and romance of...
42327,Better Call Saul,(2015–2022),46 min,"Crime, Drama",8.9,"Bob Odenkirk, Rhea Seehorn, Jonathan Banks, Pa...",The trials and tribulations of criminal lawyer...
17290,The West Wing,(1999–2006),44 min,Drama,8.9,"Martin Sheen, Rob Lowe, Allison Janney, John S...",Inside the lives of staffers in the West Wing ...
20618,Anne Rice's Mayfair Witches,(2023– ),****,"Fantasy, Horror",6.8,"Alexandra Daddario, Jack Huston, Tongayi Chiri...",Follows a neurosurgeon who discovers she is th...
3655,Batwheels,(2022– ),12 min,"Animation, Action, Adventure",6.4,"Jacob Bertrand, Kimberly Brooks, Mick Wingert,...","Bam, Redbird, Bibi, Batwing, and Buff are thru..."
31569,1899,(2022),60 min,"Drama, Mystery",7.4,"Emily Beecham, Aneurin Barnard, Andreas Pietsc...",Multinational immigrants traveling from the ol...


In [17]:
def tweak_imdb_df(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw TV-series frame and add ``end_year`` and ``duration``.

    Steps, in order:

    1. Normalise column names to snake_case (``Series Title`` -> ``series_title``).
    2. Turn the ``"****"`` placeholder in ``rating`` into NaN and cast the
       column to ``float32``; extract the minutes from ``runtime`` strings
       such as ``"45 min"`` (anything that does not match, including
       ``"****"``, becomes ``<NA>``).
    3. Split ``release_year`` strings such as ``"(2021–2023)"`` into a start
       year (``release_year``) and an ``end_year``. A single-year value such
       as ``"(2022)"`` yields the same year for both.
    4. Drop rows missing either year. This removes still-running series:
       ``"(2018– )"`` has no digits directly before ``")"``, so no end year.
    5. Keep only the first row for each ``series_title``.
    6. Add ``duration = end_year - release_year``.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame whose columns, once lower-cased with spaces replaced by
        underscores, include ``series_title``, ``release_year``, ``runtime``
        and ``rating``.

    Returns
    -------
    pd.DataFrame
        A new frame (the input is not modified) with nullable ``Int64``
        ``runtime``, ``release_year``, ``end_year`` and ``duration`` columns
        and a ``float32`` ``rating`` column.
    """
    return (
        df
        # Formatting column names (Series Title -> series_title)
        .rename(columns=lambda column_: column_.lower().replace(" ", "_"))
        .assign(
            # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
            rating=lambda df_: df_.rating.replace("****", np.nan).astype("float32"),
            # expand=False makes str.extract return a Series instead of a
            # one-column DataFrame. The float hop lets the string digits and
            # NaN be cast to the nullable Int64 dtype.
            runtime=lambda df_: (
                df_.runtime
                .str.extract(r"(\d+) min", expand=False)
                .astype("float")
                .astype("Int64")
            ),
            # end_year must be computed BEFORE release_year is overwritten below,
            # because both read the original "(YYYY–YYYY)" string.
            # The digits immediately before ")" are the end year.
            end_year=lambda df_: (
                df_.release_year
                .str.extract(r"(\d+)\)", expand=False)
                .astype("float")
                .astype("Int64")
            ),
            # The digits immediately after "(" are the first year.
            release_year=lambda df_: (
                df_.release_year
                .str.extract(r"\((\d+)", expand=False)
                .astype("float")
                .astype("Int64")
            ),
        )
        # Dropping series without a release year or without an end year (ongoing series)
        .dropna(subset=["release_year", "end_year"])
        # Dropping duplicated rows; note that distinct series sharing a title
        # are collapsed into the first occurrence as well.
        .drop_duplicates(subset="series_title")
        .assign(duration=lambda df_: df_.end_year - df_.release_year)
    )

In [20]:
# Clean the raw frame, report its size, then preview a few rows.
transformed_df = tweak_imdb_df(df)
print(transformed_df.shape)
# Fixed seed so the preview shows the same rows on every re-run.
transformed_df.sample(3, random_state=42)

(5952, 9)


Unnamed: 0,series_title,release_year,runtime,genre,rating,cast,synopsis,end_year,duration
2815,Guilty Party,2021,30,"Comedy, Crime, Mystery",5.6,"Kate Beckinsale, Jules Latimer, Tiya Sircar, L...","A discredited journalist, desperate to salvage...",2021,0
3078,The Lucy Show,1962,30,Comedy,7.2,"Lucille Ball, Gale Gordon, Vivian Vance, Jimmy...",The wacky misadventures of a forever-scheming ...,1968,6
6100,F.C. De Kampioenen,1990,33,"Comedy, Romance, Sport",6.7,"Danni Heylen, Marijn De Valck, An Swartenbroek...",Oscar Crucke is the coach of substandard enthu...,2020,30


In [23]:
# Build a long-format frame: one row per (series, genre) pair, all other columns repeated.
def explode_genres(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the ``genre`` column split into one row per genre.

    The ``genre`` strings are split on ``", "`` and each resulting item gets
    its own row; every other column value and the original index label are
    repeated for each of those rows. The input frame is left unchanged.
    """
    genre_lists = df["genre"].str.split(", ")
    with_lists = df.assign(genre=genre_lists)
    return with_lists.explode("genre")

# Expand to one row per (series, genre), report the new size, then preview a few rows.
exploded_df = explode_genres(transformed_df)
print(exploded_df.shape)
# Fixed seed so the preview shows the same rows on every re-run.
exploded_df.sample(3, random_state=42)

(13629, 9)


Unnamed: 0,series_title,release_year,runtime,genre,rating,cast,synopsis,end_year,duration
2514,Gwimul,2021,70,Drama,8.1,"Shin Ha-kyun, Yeo Jin-gu, Choi Dae-hoon, Choi ...",The story of two fearless men who are willing ...,2021,0
288,Gunsmoke,1955,60,Western,8.1,"James Arness, Milburn Stone, Amanda Blake, Ken...",Marshal Matt Dillon keeps the peace in rough-a...,1975,20
868,Jurassic World: Camp Cretaceous,2020,24,Animation,7.5,"Paul-Mikél Williams, Kausar Mohammed, Raini Ro...",Six teens attending an adventure camp on the o...,2022,2


In [34]:
# Count the distinct genres in the exploded (one row per genre) frame.
number_of_unique_genres = exploded_df.genre.nunique()
print(f"Number of unique genres: {number_of_unique_genres}")

# Count the distinct first-release years.
number_of_unique_release_years = exploded_df.release_year.nunique()
print(f"Number of unique release years: {number_of_unique_release_years}")

# Print the unique genres
unique_genres = exploded_df.genre.unique()
print(unique_genres)

Number of unique genres: 26
Number of unique release years: 79
['Comedy' 'Drama' 'Western' 'Horror' 'Thriller' 'Action' 'Adventure'
 'Family' 'Mystery' 'Crime' 'Fantasy' 'Documentary' 'History' 'Romance'
 'Sci-Fi' 'Biography' 'Animation' 'Music' 'Sport' 'War' 'Short'
 'Reality-TV' 'Musical' 'Talk-Show' 'Game-Show' 'News']


In [25]:
# Desired column order for the exported table.
# NOTE(review): the name is misspelled ("reodered"); kept because another cell references it.
columns_reodered = ['series_title','runtime','genre','rating', 'release_year', 'end_year','duration', 'cast','synopsis']

In [26]:
# Arrange the columns in the export order, then write the table to disk without the index.
tv_series = exploded_df.reindex(columns_reodered, axis="columns")
tv_series.to_csv("tv_series.csv", index=False)


In [27]:
# Report the size of the reordered table, then preview 10 rows.
print(tv_series.shape)
# Fixed seed so the preview shows the same rows on every re-run.
tv_series.sample(10, random_state=42)

(13629, 9)


Unnamed: 0,series_title,runtime,genre,rating,release_year,end_year,duration,cast,synopsis
2678,Weird Science,30,Sci-Fi,6.8,1994,1998,4,"Michael Manasseri, John Asher, Vanessa Angel, ...",Gary Wallace and Wyatt Donnelly create their d...
7362,Nobodies,30,Comedy,5.9,2017,2018,1,"Hugh Davidson, Larry Dorf, Rachel Ramras, Jill...",A group of friends works together on a childre...
4093,Prince Caspian and the Voyage of the Dawn Treader,30,Adventure,7.0,1989,1989,0,"Warwick Davis, Jonathan R. Scott, Sophie Wilco...",Young Prince Caspian of Narnia wonders and dre...
1920,Lovesick,24,Comedy,8.0,2014,2018,4,"Johnny Flynn, Antonia Thomas, Daniel Ings, Jos...","After finding out he has an STD, Dylan must ge..."
294,The Good Place,22,Drama,8.2,2016,2020,4,"Kristen Bell, William Jackson Harper, Jameela ...",Four people and their otherworldly frienemy st...
4216,The Returned,42,Horror,7.0,2015,2015,0,"Kevin Alejandro, Agnes Bruckner, India Ennenga...",A small town's residents are stunned when rece...
6747,Longstreet,90,Action,7.9,1971,1972,1,"James Franciscus, Marlyn Mason, Peter Mark Ric...",The cases of a blind insurance investigator.
1929,The Client List,42,Drama,6.5,2011,2013,2,"Jennifer Love Hewitt, Loretta Devine, Colin Eg...",Riley is a single mother living in a small Tex...
8328,The Nightmare Room,30,Fantasy,7.2,2001,2002,1,"James Avery, Keiko Agena, Kyle Gibson, Michael...",Anthology series centered on teens facing supe...
3163,The Last Tycoon,61,Drama,7.6,2016,2017,1,"Matt Bomer, Kelsey Grammer, Lily Collins, Domi...",Centers on Hollywood's first wunderkind studio...


In [32]:
# Number of unique series titles rated above the threshold.
# `tv_series` has one row per (series, genre) pair, hence nunique() on the title.
# The message previously said "greater than 8" while the filter used 8.5; a single
# variable now drives both so the label always matches the filter.
rating_threshold = 8.5

number_of_unique_series_title = tv_series[tv_series.rating > rating_threshold].series_title.nunique()
print(f"Number of unique series title with rating greater than {rating_threshold}: {number_of_unique_series_title}")

Number of unique series title with rating greater than 8: 275


In [28]:
# Load the movies dataset and preview 10 rows.
file2 = "movies.csv"
df2 = read_csv(file2)
# Fixed seed so the preview shows the same rows on every re-run.
df2.sample(10, random_state=42)

Unnamed: 0,Rank,Movie_ID,Movie_Name,Director,Year,US_Distributor,Lifetime_Gross,Budget,MPAA,Running_Time,Genre
575,576,tt0119116,The Fifth Element,Luc Besson,1997,Sony Pictures Releasing,264029966,,PG-13,2 hr 6 min,"Action, Adventure, Sci-Fi"
350,351,tt0106977,The Fugitive,Andrew Davis,1993,Warner Bros.,368875760,,PG-13,2 hr 10 min,"Action, Crime, Drama, Mystery, Thriller"
156,157,tt0381061,Casino Royale,Martin Campbell,2006,Sony Pictures Releasing,616501619,"$150,000,000",PG-13,2 hr 24 min,"Action, Adventure, Thriller"
79,80,tt1375666,Inception,Christopher Nolan,2010,Warner Bros.,829895144,"$160,000,000",PG-13,2 hr 28 min,"Action, Adventure, Sci-Fi, Thriller"
926,927,tt1001508,He's Just Not That Into You,Ken Kwapis,2009,Warner Bros.,178866158,,PG-13,2 hr 9 min,"Comedy, Drama, Romance"
834,835,tt0258000,Panic Room,David Fincher,2002,Sony Pictures Releasing,197079546,"$48,000,000",R,1 hr 52 min,"Crime, Drama, Thriller"
283,284,tt1502397,Bad Boys for Life,"Adil El Arbi, Bilall Fallah",2020,Sony Pictures Releasing,419074646,"$90,000,000",R,2 hr 4 min,"Action, Comedy, Crime, Thriller"
705,706,tt0489099,Jumper,Doug Liman,2008,Twentieth Century Fox,225132113,"$85,000,000",PG-13,1 hr 28 min,"Action, Adventure, Sci-Fi, Thriller"
682,683,tt0425061,Get Smart,Peter Segal,2008,Warner Bros.,230685453,"$80,000,000",PG-13,1 hr 50 min,"Action, Adventure, Comedy"
100,101,tt0382625,The Da Vinci Code,Ron Howard,2006,Sony Pictures Releasing,760006945,"$125,000,000",PG-13,2 hr 29 min,"Mystery, Thriller"
