In [1]:
import pandas as pd # for data manipulation
import numpy as np  # for numerical operations
import seaborn as sns  # Importing Seaborn library for data visualization
import matplotlib.pyplot as plt  # Importing Matplotlib library for plotting
import re # Import the regex module for pattern matching
import warnings
warnings.filterwarnings('ignore')# filter out warnings

# Import data

In [2]:
# Load the raw dataset from a CSV file located next to the notebook,
# then preview the first rows to sanity-check the parse.
movies_data = pd.read_csv('n_movies.csv')
movies_data.head()

Unnamed: 0,title,year,certificate,duration,genre,rating,description,stars,votes
0,Cobra Kai,(2018– ),TV-14,30 min,"Action, Comedy, Drama",8.5,Decades after their 1984 All Valley Karate Tou...,"['Ralph Macchio, ', 'William Zabka, ', 'Courtn...",177031
1,The Crown,(2016– ),TV-MA,58 min,"Biography, Drama, History",8.7,Follows the political rivalries and romance of...,"['Claire Foy, ', 'Olivia Colman, ', 'Imelda St...",199885
2,Better Call Saul,(2015–2022),TV-MA,46 min,"Crime, Drama",8.9,The trials and tribulations of criminal lawyer...,"['Bob Odenkirk, ', 'Rhea Seehorn, ', 'Jonathan...",501384
3,Devil in Ohio,(2022),TV-MA,356 min,"Drama, Horror, Mystery",5.9,When a psychiatrist shelters a mysterious cult...,"['Emily Deschanel, ', 'Sam Jaeger, ', 'Gerardo...",9773
4,Cyberpunk: Edgerunners,(2022– ),TV-MA,24 min,"Animation, Action, Adventure",8.6,A Street Kid trying to survive in a technology...,"['Zach Aguilar, ', 'Kenichiro Ohashi, ', 'Emi ...",15413


In [3]:
# Report the (rows, columns) dimensions of the loaded frame
movies_data.shape

(9957, 9)

# Check data types

In [7]:
# Summarise each column's dtype and non-null count to spot missing data
movies_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9957 entries, 0 to 9956
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        9957 non-null   object 
 1   year         9430 non-null   object 
 2   certificate  6504 non-null   object 
 3   duration     7921 non-null   object 
 4   genre        9884 non-null   object 
 5   rating       8784 non-null   float64
 6   description  9957 non-null   object 
 7   stars        9957 non-null   object 
 8   votes        8784 non-null   object 
dtypes: float64(1), object(8)
memory usage: 700.2+ KB


In [8]:
# List the dtype of every column as loaded
movies_data.dtypes

title           object
year            object
certificate     object
duration        object
genre           object
rating         float64
description     object
stars           object
votes           object
dtype: object

# Select the required columns and rename them for better readability and consistency

In [9]:
# Inspect the current column labels before selecting and renaming them
movies_data.columns

Index(['title', 'year', 'certificate', 'duration', 'genre', 'rating',
       'description', 'stars', 'votes'],
      dtype='object')

In [10]:
# Split the raw column names into two groups:
#   required_column   - columns kept for the rest of the analysis, in the
#                       order they should appear in the working frame
#   unrequired_column - columns that are not needed downstream
required_column = [
    'title',
    'genre',
    'rating',
    'year',
    'description',
    'votes',
    'certificate',
    'duration',
]
unrequired_column = ['stars']

In [11]:
# Keep only the columns named in 'required_column' (all rows, in that column order),
# dropping everything else from the working frame.
movies_data = movies_data.loc[:, required_column]

In [12]:
# Map every lowercase source label to its capitalised display label so the
# frame uses one consistent naming convention from here on.
column_labels = {
    "title": "Title",
    "year": "Year",
    "certificate": "Certificate",
    "duration": "Duration",
    "genre": "Genre",
    "rating": "Rating",
    "description": "Description",
    "votes": "Votes",
}
movies_data = movies_data.rename(columns=column_labels)
movies_data.head()

Unnamed: 0,Title,Genre,Rating,Year,Description,Votes,Certificate,Duration
0,Cobra Kai,"Action, Comedy, Drama",8.5,(2018– ),Decades after their 1984 All Valley Karate Tou...,177031,TV-14,30 min
1,The Crown,"Biography, Drama, History",8.7,(2016– ),Follows the political rivalries and romance of...,199885,TV-MA,58 min
2,Better Call Saul,"Crime, Drama",8.9,(2015–2022),The trials and tribulations of criminal lawyer...,501384,TV-MA,46 min
3,Devil in Ohio,"Drama, Horror, Mystery",5.9,(2022),When a psychiatrist shelters a mysterious cult...,9773,TV-MA,356 min
4,Cyberpunk: Edgerunners,"Animation, Action, Adventure",8.6,(2022– ),A Street Kid trying to survive in a technology...,15413,TV-MA,24 min


# Clean the Year column

In [13]:
# Take the first 4-digit run found in each 'Year' string (e.g. "(2015–2022)" -> "2015").
# expand=False yields a Series; rows without a match become missing.
first_year = movies_data['Year'].str.extract(r'(\d{4})', expand=False)

# Go through float so missing values survive, then store as nullable integer.
movies_data['Year'] = first_year.astype(float).astype('Int64')

In [14]:
# Confirm that Year is now stored as a nullable integer column
movies_data.dtypes

Title           object
Genre           object
Rating         float64
Year             Int64
Description     object
Votes           object
Certificate     object
Duration        object
dtype: object

In [15]:
# Preview the frame after the Year column has been cleaned
movies_data.head()

Unnamed: 0,Title,Genre,Rating,Year,Description,Votes,Certificate,Duration
0,Cobra Kai,"Action, Comedy, Drama",8.5,2018,Decades after their 1984 All Valley Karate Tou...,177031,TV-14,30 min
1,The Crown,"Biography, Drama, History",8.7,2016,Follows the political rivalries and romance of...,199885,TV-MA,58 min
2,Better Call Saul,"Crime, Drama",8.9,2015,The trials and tribulations of criminal lawyer...,501384,TV-MA,46 min
3,Devil in Ohio,"Drama, Horror, Mystery",5.9,2022,When a psychiatrist shelters a mysterious cult...,9773,TV-MA,356 min
4,Cyberpunk: Edgerunners,"Animation, Action, Adventure",8.6,2022,A Street Kid trying to survive in a technology...,15413,TV-MA,24 min


# Find and Remove Duplicates

In [16]:
# Count rows that are exact duplicates of an earlier row (all columns equal)
movies_data.duplicated().sum()

12

In [17]:
# Discard repeated rows, keeping the first occurrence of each; the result is
# bound back to the same name instead of mutating the frame in place.
movies_data = movies_data.drop_duplicates()

In [18]:
# Verify that no duplicated rows remain after the drop
movies_data.duplicated().sum()

0

# Handle missing values

In [19]:
# Count missing values per column to decide how each one should be filled
movies_data.isnull().sum()

Title             0
Genre            73
Rating         1167
Year            631
Description       0
Votes          1167
Certificate    3445
Duration       2024
dtype: int64

## Fill null values

In [20]:
# Impute missing ratings with the average of the ratings that are present
mean_rating = movies_data['Rating'].mean()
movies_data['Rating'] = movies_data['Rating'].fillna(mean_rating)

In [23]:
# Label rows that have no genre as 'Unknown'; rows with a genre are left as they are
genre = movies_data['Genre']
movies_data['Genre'] = genre.mask(genre.isna(), 'Unknown')

In [24]:
# unique_genres


In [25]:
# Fill missing years from neighbouring rows: back-fill copies the next
# available year into each gap, and the trailing forward-fill covers any gap at
# the very end of the frame, which back-fill alone cannot reach.
# Series.bfill()/ffill() are used because fillna(method=...) is deprecated
# since pandas 2.1.
movies_data['Year'] = movies_data['Year'].bfill().ffill()

In [26]:
# Treat missing vote counts as 0 and render everything as text so the
# thousands separators can be stripped uniformly.
votes_text = movies_data['Votes'].fillna(0).astype(str)

# Drop the commas ("177,031" -> "177031") and store the counts as integers.
movies_data['Votes'] = votes_text.str.replace(',', '').astype(int)


In [27]:
# Parse the first run of digits (the minutes) out of values such as "30 min".
# astype(str) keeps the .str accessor usable whatever the column currently
# holds; missing values become the text 'nan', which has no digits and
# therefore comes back as NaN after extraction.
duration_minutes = (
    movies_data['Duration']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)  # expand=False -> Series rather than a 1-column frame
    .astype(float)
)

# The parsed values are computed once and reused for the median, instead of
# repeating the whole extraction inside fillna(). Gaps get the median of the
# parsed durations, then the column is stored as integers.
movies_data['Duration'] = duration_minutes.fillna(duration_minutes.median()).astype(int)


In [28]:
# Mark titles without a certificate as 'Unrated'; only the 'Certificate'
# column is targeted by the fill mapping.
movies_data = movies_data.fillna({'Certificate': 'Unrated'})

In [29]:
# Confirm that every column is now free of missing values
movies_data.isnull().sum()

Title          0
Genre          0
Rating         0
Year           0
Description    0
Votes          0
Certificate    0
Duration       0
dtype: int64

In [30]:
# Re-check dtypes after imputation and the numeric conversions of Votes and Duration
movies_data.dtypes


Title           object
Genre           object
Rating         float64
Year             Int64
Description     object
Votes            int32
Certificate     object
Duration         int32
dtype: object

In [31]:
# Dimensions after column selection and duplicate removal
movies_data.shape

(9945, 8)

In [32]:
# Re-check for duplicated rows now that missing values have been filled
movies_data.duplicated().sum()

0

In [33]:
# Preview the frame after missing-value handling
movies_data.head()

Unnamed: 0,Title,Genre,Rating,Year,Description,Votes,Certificate,Duration
0,Cobra Kai,"Action, Comedy, Drama",8.5,2018,Decades after their 1984 All Valley Karate Tou...,177031,TV-14,30
1,The Crown,"Biography, Drama, History",8.7,2016,Follows the political rivalries and romance of...,199885,TV-MA,58
2,Better Call Saul,"Crime, Drama",8.9,2015,The trials and tribulations of criminal lawyer...,501384,TV-MA,46
3,Devil in Ohio,"Drama, Horror, Mystery",5.9,2022,When a psychiatrist shelters a mysterious cult...,9773,TV-MA,356
4,Cyberpunk: Edgerunners,"Animation, Action, Adventure",8.6,2022,A Street Kid trying to survive in a technology...,15413,TV-MA,24


# Check and remove special characters


In [34]:
# Anything that is not an ASCII letter, a digit or whitespace counts as "special".
# NOTE(review): besides punctuation in titles/descriptions, this pattern also
# strips the commas separating genres ("Action, Comedy" -> "Action Comedy"),
# the hyphen in certificates ("TV-14" -> "TV14") and any non-ASCII letters —
# confirm that losing these is intended.
special_char_pattern = r'[^a-zA-Z0-9\s]'

# Only the text columns are scrubbed; numeric columns are not touched, so
# their dtypes are preserved.
text_columns = ['Title', 'Genre', 'Description', 'Certificate']

# Vectorised .str.replace is used instead of DataFrame.applymap, which is
# deprecated since pandas 2.1 and calls a Python function on every single cell.
for column in text_columns:
    movies_data[column] = movies_data[column].str.replace(special_char_pattern, '', regex=True)

movies_data.head()

Unnamed: 0,Title,Genre,Rating,Year,Description,Votes,Certificate,Duration
0,Cobra Kai,Action Comedy Drama,8.5,2018,Decades after their 1984 All Valley Karate Tou...,177031,TV14,30
1,The Crown,Biography Drama History,8.7,2016,Follows the political rivalries and romance of...,199885,TVMA,58
2,Better Call Saul,Crime Drama,8.9,2015,The trials and tribulations of criminal lawyer...,501384,TVMA,46
3,Devil in Ohio,Drama Horror Mystery,5.9,2022,When a psychiatrist shelters a mysterious cult...,9773,TVMA,356
4,Cyberpunk Edgerunners,Animation Action Adventure,8.6,2022,A Street Kid trying to survive in a technology...,15413,TVMA,24


In [35]:
columns_to_clean = ['Title']
# Re-apply the special-character pattern to the listed column(s) with the
# vectorised .str.replace (DataFrame.applymap is deprecated since pandas 2.1).
# Title was already scrubbed by the all-columns pass in the previous cell, so
# this is expected to leave the data unchanged.
for column in columns_to_clean:
    movies_data[column] = movies_data[column].str.replace(special_char_pattern, '', regex=True)
movies_data.head()

Unnamed: 0,Title,Genre,Rating,Year,Description,Votes,Certificate,Duration
0,Cobra Kai,Action Comedy Drama,8.5,2018,Decades after their 1984 All Valley Karate Tou...,177031,TV14,30
1,The Crown,Biography Drama History,8.7,2016,Follows the political rivalries and romance of...,199885,TVMA,58
2,Better Call Saul,Crime Drama,8.9,2015,The trials and tribulations of criminal lawyer...,501384,TVMA,46
3,Devil in Ohio,Drama Horror Mystery,5.9,2022,When a psychiatrist shelters a mysterious cult...,9773,TVMA,356
4,Cyberpunk Edgerunners,Animation Action Adventure,8.6,2022,A Street Kid trying to survive in a technology...,15413,TVMA,24


In [36]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
# NOTE(review): the file name is spelled "moviess" — confirm downstream readers expect this.
movies_data.to_csv('cleaned_n_moviess.csv', index=False)