In [1]:
# importing libraries 

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [2]:
# importing dataset 

dataset = pd.read_csv('movies.csv')

dataset.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,
3,Rick and Morty,(2013– ),"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,
4,Army of Thieves,(2021),"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,


In [3]:
# last five columns of the dataset 

dataset.tail()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
9994,The Imperfects,(2021– ),"\nAdventure, Drama, Fantasy",,\nAdd a Plot\n,\n \n Stars:\nMorgan Taylor Camp...,,,
9995,Arcane,(2021– ),"\nAnimation, Action, Adventure",,\nAdd a Plot\n,\n,,,
9996,Heart of Invictus,(2022– ),"\nDocumentary, Sport",,\nAdd a Plot\n,\n Director:\nOrlando von Einsiedel\n| \n ...,,,
9997,The Imperfects,(2021– ),"\nAdventure, Drama, Fantasy",,\nAdd a Plot\n,\n Director:\nJovanka Vuckovic\n| \n Sta...,,,
9998,The Imperfects,(2021– ),"\nAdventure, Drama, Fantasy",,\nAdd a Plot\n,\n Director:\nJovanka Vuckovic\n| \n Sta...,,,


In [4]:
# info about the dataset 

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   MOVIES    9999 non-null   object 
 1   YEAR      9355 non-null   object 
 2   GENRE     9919 non-null   object 
 3   RATING    8179 non-null   float64
 4   ONE-LINE  9999 non-null   object 
 5   STARS     9999 non-null   object 
 6   VOTES     8179 non-null   object 
 7   RunTime   7041 non-null   float64
 8   Gross     460 non-null    object 
dtypes: float64(2), object(7)
memory usage: 703.2+ KB


In [5]:
# shape of the dataframe

dataset.shape

(9999, 9)

In [6]:
# length of dataset 

len(dataset)


9999

## Working with the YEAR column

In [7]:
# non-null count

dataset['YEAR'].notna().sum()

9355

In [8]:
# null count

dataset['YEAR'].isna().sum()

644

In [9]:
# making a copy of the year column 

dataset['Year_Cleaned'] = dataset['YEAR'].copy()

In [10]:
# removing the parentheses]

dataset['Year_Cleaned'] = dataset['Year_Cleaned'].str.replace(r'[()]', '', regex= True)

In [11]:
# extra spaces 

dataset['Year_Cleaned'] = dataset['Year_Cleaned'].str.strip()

In [12]:
# removing all extra spaces before and after the dashes 

dataset['Year_Cleaned']= dataset['Year_Cleaned'].str.replace(' ', '', regex= False)

In [13]:
# removing the standalone dashes before the years 

dataset['Year_Cleaned'] = dataset['Year_Cleaned'].str.replace(r'^–', '', regex= True)

In [14]:
# removing the standalone dashes after the years 

dataset['Year_Cleaned'] = dataset['Year_Cleaned'].str.replace(r'–$', '', regex= True)

In [15]:
dataset['Year_Cleaned'].str.strip()

0            2021
1            2021
2       2010–2022
3            2013
4            2021
          ...    
9994         2021
9995         2021
9996         2022
9997         2021
9998         2021
Name: Year_Cleaned, Length: 9999, dtype: object

In [25]:
# handling blanks/ none cells 

dataset['Year_Cleaned'] = dataset['Year_Cleaned'].replace([' ', 'None', None], np.nan)

dataset['Year_Cleaned'] = dataset['Year_Cleaned'].replace(r'^\s*$', np.nan, regex= True)

In [34]:
dataset['Year_Cleaned'].isna().sum()

0

In [33]:
# fiiling the NaN with a placeholder value 

dataset['Year_Cleaned'] = dataset['Year_Cleaned'].fillna('Unknown')

In [36]:
dataset['Year_Cleaned']

0            2021
1            2021
2       2010–2022
3            2013
4            2021
          ...    
9994         2021
9995         2021
9996         2022
9997         2021
9998         2021
Name: Year_Cleaned, Length: 9999, dtype: object

In [None]:
# checking the columns 

dataset.columns

Index(['MOVIES', 'YEAR', 'GENRE', 'RATING', 'ONE-LINE', 'STARS', 'VOTES',
       'RunTime', 'Gross', 'Year_Cleaned'],
      dtype='object')

## Working with the GENRE Column

In [None]:
# making a copy of the genre column

dataset['Genre_Cleaned'] = dataset['GENRE'].copy()

In [49]:
# removing the newline characters 

dataset['Genre_Cleaned'] = dataset['Genre_Cleaned'].replace('\n', '', regex= False)

In [50]:
# stripping the extra spaces 

dataset['Genre_Cleaned'] = dataset['Genre_Cleaned'].str.strip()

In [53]:
# handling missing values 

dataset['Genre_Cleaned'] = dataset['Genre_Cleaned'].replace(' ', np.nan)

In [63]:
# replacing the nan with unknown 

dataset['Genre_Cleaned'] = dataset['Genre_Cleaned'].fillna('Unknown')

In [64]:
# checking the results 

dataset['Genre_Cleaned'].head()

dataset['Genre_Cleaned'].isna().sum()

0

## Working with the RATING Column

In [None]:
# checking the basic statistics 

dataset['RATING'].describe()

count    8179.000000
mean        6.921176
std         1.220232
min         1.100000
25%         6.200000
50%         7.100000
75%         7.800000
max         9.900000
Name: RATING, dtype: float64

In [None]:
# checking the count of missing values 

dataset['RATING'].isna().sum()

1820

In [69]:
# checking the range of ratings 

dataset['RATING'].min()

1.1

In [70]:
dataset['RATING'].max()

9.9

In [71]:
# making the copy of the rating column 

dataset['Rating_Cleaned'] = dataset['RATING'].copy()

In [73]:
# handling the missing values 

# filling with NaN 

dataset['Rating_Cleaned']= dataset['Rating_Cleaned'].replace('', np.nan)

In [75]:
dataset['Rating_Cleaned'].isna().sum()

1820

## Working with VOTES Column

In [82]:
# making the copy of the votes column

dataset['Votes_Cleaned'] = dataset['VOTES'].copy()

In [83]:
# removing the commas 

dataset['Votes_Cleaned'] = dataset['Votes_Cleaned'].str.replace(',', '', regex= False)

In [None]:
# coverting to numeric from object dtype 

# errors = 'coerce' converts any non- numeric values to NaN including the existing NaN

dataset['Votes_Cleaned'] = pd.to_numeric(dataset['Votes_Cleaned'], errors = 'coerce')

In [None]:
# verifying the conversion 

dataset['Votes_Cleaned'].dtype

dtype('float64')

In [89]:
dataset['Votes_Cleaned'].head(10)

0     21062.0
1     17870.0
2    885805.0
3    414849.0
4         NaN
5     25858.0
6      5283.0
7    665387.0
8     34530.0
9     27279.0
Name: Votes_Cleaned, dtype: float64

In [90]:
dataset['Votes_Cleaned'].isna().sum()

1820

## Working with RunTime Column

In [95]:
dataset['RunTime']

0       121.0
1        25.0
2        44.0
3        23.0
4         NaN
        ...  
9994      NaN
9995      NaN
9996      NaN
9997      NaN
9998      NaN
Name: RunTime, Length: 9999, dtype: float64

In [96]:
# checking basic statistics 

dataset['RunTime'].describe()

count    7041.000000
mean       68.688539
std        47.258056
min         1.000000
25%        36.000000
50%        60.000000
75%        95.000000
max       853.000000
Name: RunTime, dtype: float64

In [97]:
dataset['RunTime'].min()

1.0

In [98]:
dataset['RunTime'].max()

853.0

## Working with GROSS Column

In [107]:
# making the copy of the column 

dataset['Gross_Cleaned'] = dataset['Gross'].copy()

In [108]:
# removing the dollar sign 

dataset['Gross_Cleaned']= dataset['Gross_Cleaned'].str.replace('$','', regex= False)

In [109]:
# removing the M and coverting to actual numbers 

dataset['Gross_Cleaned'] = dataset['Gross_Cleaned'].str.replace('M', '', regex= False)

In [110]:
# converting to numeric 

dataset['Gross_Cleaned'] =pd.to_numeric(dataset['Gross_Cleaned'], errors='coerce')

In [111]:
# multiplying by 1,000,000 to get actaul dollar values 

dataset['Gross_Cleaned'] = dataset['Gross_Cleaned'] * 1000000

In [117]:
# verifying the result 

dataset['Gross_Cleaned'].head(100)

0            NaN
1            NaN
2            NaN
3            NaN
4            NaN
         ...    
95    89220000.0
96           NaN
97           NaN
98           NaN
99           NaN
Name: Gross_Cleaned, Length: 100, dtype: float64

In [118]:
dataset['Gross_Cleaned'].dtype

dtype('float64')

In [119]:
dataset['Gross_Cleaned'].isna().sum()

9539

## Working with ONE-LINE Column

In [None]:
dataset['ONE-LINE'].head(10)

0    \nA woman with a mysterious illness is forced ...
1    \nThe war for Eternia begins again in what may...
2    \nSheriff Deputy Rick Grimes wakes up from a c...
3    \nAn animated series that follows the exploits...
4    \nA prequel, set before the events of Army of ...
5    \nA group of teenagers from the wrong side of ...
6    \nA pair of interwoven stories set in the past...
7    \nBy day, mild-mannered Dexter is a blood-spat...
8    \nThe complicated life of a modern-day first g...
9    \nSeeking a fresh start, nurse practitioner Me...
Name: ONE-LINE, dtype: object

In [None]:
# checking for missing values 

dataset['ONE-LINE'].isna().sum()

0

In [None]:
# checking for empty strings 

(dataset['ONE-LINE'] == ' ').sum()

0

In [127]:
# checking for whitespaced only strings 

dataset['ONE-LINE'].str.strip().eq('').sum()

0

In [128]:
# making a copy of the column 

dataset['OneLine_Cleaned'] = dataset['ONE-LINE'].copy()

In [129]:
# removing the newline characters 

dataset['OneLine_Cleaned'] = dataset['OneLine_Cleaned'].str.replace('\n', '', regex= False)

In [130]:
# removing extra spaces 

dataset['OneLine_Cleaned'] = dataset['OneLine_Cleaned'].str.strip()

In [132]:
# replacing the empty strings if any 

dataset['OneLine_Cleaned'] = dataset['OneLine_Cleaned'].replace('', np.nan)

In [133]:
# checking the result

dataset['OneLine_Cleaned'].head()

0    A woman with a mysterious illness is forced in...
1    The war for Eternia begins again in what may b...
2    Sheriff Deputy Rick Grimes wakes up from a com...
3    An animated series that follows the exploits o...
4    A prequel, set before the events of Army of th...
Name: OneLine_Cleaned, dtype: object

In [134]:
dataset['OneLine_Cleaned'].isna().sum()

0