In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/top-250-imdb-tv-shows/IMDB_Top250_Tvshows.csv


In [2]:
df = pd.read_csv('/kaggle/input/top-250-imdb-tv-shows/IMDB_Top250_Tvshows.csv', encoding='ISO-8859-1')

# Display the first few rows of the DataFrame
print(df.head())

                Titile       Year Total_episodes Age  Rating Vote_count  \
0      1. Breaking Bad  20082013         62 eps  18     9.5     (2.2M)   
1   2. Planet Earth II       2016          6 eps  PG     9.5     (159K)   
2      3. Planet Earth       2006         11 eps  PG     9.4     (221K)   
3  4. Band of Brothers       2001         10 eps  15     9.4     (533K)   
4         5. Chernobyl       2019          5 eps  15     9.3     (876K)   

         Category  
0       TV Series  
1  TV Mini Series  
2  TV Mini Series  
3  TV Mini Series  
4  TV Mini Series  


In [3]:
# Rename the column `Titile` to `Title`
df = df.rename(columns={'Titile': 'Title'})

# Extract the starting year from the `Year` column
df['Year'] = df['Year'].astype(str).str.split('–', expand=True)[0]
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Remove non-numeric characters from `Vote_count`
df['Vote_count'] = df['Vote_count'].astype(str).str.replace(r'[(),MK]', '', regex=True)

# Replace 'M' with 'e6' and 'K' with 'e3'
df['Vote_count'] = df['Vote_count'].str.replace('M', 'e6', regex=False).str.replace('K', 'e3', regex=False)

# Convert `Vote_count` to numeric
df['Vote_count'] = pd.to_numeric(df['Vote_count'], errors='coerce')

# Calculate and print the number of missing values in each column
print(f"Missing values per column:\n{df.isnull().sum()}")

# Filter out rows where `Year` or `Vote_count` is NaN
df.dropna(subset=['Year', 'Vote_count'], inplace=True)

# Print the data types of all columns
print(f"\nData types of all columns:\n{df.dtypes}")

Missing values per column:
Title               0
Year              207
Total_episodes      0
Age                 6
Rating              0
Vote_count          0
Category            0
dtype: int64

Data types of all columns:
Title              object
Year              float64
Total_episodes     object
Age                object
Rating            float64
Vote_count        float64
Category           object
dtype: object


In [4]:
import altair as alt
import pandas as pd

# Assuming df is your DataFrame loaded from a CSV or other source
# df = pd.read_csv('your_file.csv')

# 1. Histogram of Rating distribution
chart1 = alt.Chart(df).mark_bar().encode(
    x=alt.X('Rating:Q', bin=True, title='Rating'),
    y=alt.Y('count()', title='Number of Shows'),
    tooltip=[alt.Tooltip('Rating:Q', bin=True, title='Rating'), 'count()']
).properties(
    title='Distribution of Ratings'
).interactive()


# 2. Scatter plot of Rating vs. Vote_count
chart2 = alt.Chart(df).mark_circle().encode(
    x=alt.X('Rating:Q', title='Rating'),
    y=alt.Y('Vote_count:Q', title='Vote Count'),
    tooltip=['Title', 'Rating', 'Vote_count']
).properties(
    title='Rating vs. Vote Count'
).interactive()

# 3. Line plot of average Rating by Year
chart3 = alt.Chart(df).mark_line(point=True).encode(
    x=alt.X('Year:O', title='Year'),
    y=alt.Y('mean(Rating):Q', title='Average Rating'),
    tooltip=['Year', alt.Tooltip('mean(Rating):Q', title='Average Rating')]
).properties(
    title='Average Rating by Year'
).interactive()

# 4. Bar chart of the number of shows per Age category
chart4 = alt.Chart(df).mark_bar().encode(
    x=alt.X('Age:N', title='Age Category', sort='-y'),
    y=alt.Y('count()', title='Number of Shows'),
    tooltip=['Age', 'count()']
).properties(
    title='Number of Shows per Age Category'
).interactive()


# Display the charts
chart1.display()
chart2.display()
chart3.display()
chart4.display()


In [5]:
import altair as alt
import pandas as pd

# Assuming df is your DataFrame loaded from a CSV or other source
# df = pd.read_csv('your_file.csv')

# Data preprocessing steps
df['Total_episodes'] = df['Total_episodes'].astype(str).str.replace(' eps', '', regex=False)
df['Total_episodes'] = pd.to_numeric(df['Total_episodes'], errors='coerce')
df['Total_episodes'] = df['Total_episodes'].fillna(0)

# 1. Scatter plot of Rating vs. Total_episodes
chart1 = alt.Chart(df).mark_circle().encode(
    x=alt.X('Total_episodes:Q', title='Total Episodes'),
    y=alt.Y('Rating:Q', title='Rating'),
    tooltip=['Title', 'Rating', 'Total_episodes'],
    color=alt.value('skyblue')
).properties(
    title='Rating vs. Total Episodes'
).interactive()

# Display the scatter plot
chart1.display()

# 2. Histogram of the number of shows per year
chart2 = alt.Chart(df).mark_bar().encode(
    x=alt.X('Year:O', title='Year'),
    y=alt.Y('count()', title='Number of Shows'),
    tooltip=['Year', 'count()'],
    color=alt.value('skyblue')
).properties(
    title='Number of Shows per Year'
).interactive()

# Display the histogram
chart2.display()

# 3. Bar chart of top 10 highest-rated shows
top_10_rated = df.nlargest(10, 'Rating')
chart3 = alt.Chart(top_10_rated).mark_bar().encode(
    x=alt.X('Title:N', sort='-y', title='Title'),
    y=alt.Y('Rating:Q', title='Rating'),
    tooltip=['Title', 'Rating'],
    color=alt.value('skyblue')
).properties(
    title='Top 10 Highest Rated Shows'
).interactive()

# Display the bar chart
chart3.display()

# 4. Bar chart of top 10 shows with the most votes
top_10_votes = df.nlargest(10, 'Vote_count')
chart4 = alt.Chart(top_10_votes).mark_bar().encode(
    x=alt.X('Title:N', sort='-y', title='Title'),
    y=alt.Y('Vote_count:Q', title='Vote Count'),
    tooltip=['Title', 'Vote_count'],
    color=alt.value('skyblue')
).properties(
    title='Top 10 Shows with Most Votes'
).interactive()

# Display the bar chart
chart4.display()