# Netflix Viewer Ratings: Data Cleaning and Preparation for Movie Recommendations

### Importing Libraries

In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Loading the Dataset

In [69]:
# Read the movie CSV into a DataFrame.
# lineterminator='\n' tells pandas to end a row only at '\n'; the file's line
# terminators are reportedly inconsistent, and without this setting pandas may
# misinterpret the file's structure.
# NOTE(review): the path below is absolute and machine-specific.
csv_path = r'C:\Users\shekharram\OneDrive\Desktop\jupyter projects\iScale\mymoviedb.csv'
df = pd.read_csv(csv_path, lineterminator='\n')
df.head()

Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Genre,Poster_Url
0,2021-12-15,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...,5083.954,8940,8.3,en,"Action, Adventure, Science Fiction",https://image.tmdb.org/t/p/original/1g0dhYtq4i...
1,2022-03-01,The Batman,"In his second year of fighting crime, Batman u...",3827.658,1151,8.1,en,"Crime, Mystery, Thriller",https://image.tmdb.org/t/p/original/74xTEgt7R3...
2,2022-02-25,No Exit,Stranded at a rest stop in the mountains durin...,2618.087,122,6.3,en,Thriller,https://image.tmdb.org/t/p/original/vDHsLnOWKl...
3,2021-11-24,Encanto,"The tale of an extraordinary family, the Madri...",2402.201,5076,7.7,en,"Animation, Comedy, Family, Fantasy",https://image.tmdb.org/t/p/original/4j0PNHkMr5...
4,2021-12-22,The King's Man,As a collection of history's worst tyrants and...,1895.511,1793,7.0,en,"Action, Adventure, Thriller, War",https://image.tmdb.org/t/p/original/aq4Pwv5Xeu...


### Basic Information and Overview

####  Displaying the first few rows of the dataset:

In [70]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Genre,Poster_Url
0,2021-12-15,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...,5083.954,8940,8.3,en,"Action, Adventure, Science Fiction",https://image.tmdb.org/t/p/original/1g0dhYtq4i...
1,2022-03-01,The Batman,"In his second year of fighting crime, Batman u...",3827.658,1151,8.1,en,"Crime, Mystery, Thriller",https://image.tmdb.org/t/p/original/74xTEgt7R3...
2,2022-02-25,No Exit,Stranded at a rest stop in the mountains durin...,2618.087,122,6.3,en,Thriller,https://image.tmdb.org/t/p/original/vDHsLnOWKl...
3,2021-11-24,Encanto,"The tale of an extraordinary family, the Madri...",2402.201,5076,7.7,en,"Animation, Comedy, Family, Fantasy",https://image.tmdb.org/t/p/original/4j0PNHkMr5...
4,2021-12-22,The King's Man,As a collection of history's worst tyrants and...,1895.511,1793,7.0,en,"Action, Adventure, Thriller, War",https://image.tmdb.org/t/p/original/aq4Pwv5Xeu...


#### Displaying the last few rows:

In [71]:
# Preview the last five rows.
df.tail(n=5)

Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Genre,Poster_Url
9822,1973-10-15,Badlands,A dramatization of the Starkweather-Fugate kil...,13.357,896,7.6,en,"Drama, Crime",https://image.tmdb.org/t/p/original/z81rBzHNgi...
9823,2020-10-01,Violent Delights,A female vampire falls in love with a man she ...,13.356,8,3.5,es,Horror,https://image.tmdb.org/t/p/original/4b6HY7rud6...
9824,2016-05-06,The Offering,When young and successful reporter Jamie finds...,13.355,94,5.0,en,"Mystery, Thriller, Horror",https://image.tmdb.org/t/p/original/h4uMM1wOhz...
9825,2021-03-31,The United States vs. Billie Holiday,Billie Holiday spent much of her career being ...,13.354,152,6.7,en,"Music, Drama, History",https://image.tmdb.org/t/p/original/vEzkxuE2sJ...
9826,1984-09-23,Threads,Documentary style account of a nuclear holocau...,13.354,186,7.8,en,"War, Drama, Science Fiction",https://image.tmdb.org/t/p/original/lBhU4U9Eeh...


#### Dataset's shape (rows and columns):

In [72]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(9827, 9)

#### Columns in the dataset

In [73]:
# List the column labels of the frame.
df.columns

Index(['Release_Date', 'Title', 'Overview', 'Popularity', 'Vote_Count',
       'Vote_Average', 'Original_Language', 'Genre', 'Poster_Url'],
      dtype='object')

### Checking the Dataset's Data Types and Summary

#### Data types of columns values

In [74]:
# Per-column dtypes; Release_Date is still an object (string) column here.
df.dtypes

Release_Date          object
Title                 object
Overview              object
Popularity           float64
Vote_Count             int64
Vote_Average         float64
Original_Language     object
Genre                 object
Poster_Url            object
dtype: object

#### Summary statistics for numerical columns

In [75]:
# count / mean / std / min / quartiles / max for the numeric columns only.
df.describe()

Unnamed: 0,Popularity,Vote_Count,Vote_Average
count,9827.0,9827.0,9827.0
mean,40.326088,1392.805536,6.439534
std,108.873998,2611.206907,1.129759
min,13.354,0.0,0.0
25%,16.1285,146.0,5.9
50%,21.199,444.0,6.5
75%,35.1915,1376.0,7.1
max,5083.954,31077.0,10.0


#### Detailed summary including non-numerical columns

In [76]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9827 entries, 0 to 9826
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Release_Date       9827 non-null   object 
 1   Title              9827 non-null   object 
 2   Overview           9827 non-null   object 
 3   Popularity         9827 non-null   float64
 4   Vote_Count         9827 non-null   int64  
 5   Vote_Average       9827 non-null   float64
 6   Original_Language  9827 non-null   object 
 7   Genre              9827 non-null   object 
 8   Poster_Url         9827 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 691.1+ KB


##### Exploring the 'Genre' column

In [77]:
# First five Genre values; each entry is a comma-separated list of genres.
df.loc[:, 'Genre'].head(n=5)

0    Action, Adventure, Science Fiction
1              Crime, Mystery, Thriller
2                              Thriller
3    Animation, Comedy, Family, Fantasy
4      Action, Adventure, Thriller, War
Name: Genre, dtype: object

## Exploring Data Quality

#### Detecting duplicate rows 

In [78]:
# Count rows that exactly repeat an earlier row (all columns compared).
df.duplicated().sum()

np.int64(0)

#### Checking for missing or null values 

In [79]:
# Number of missing (NaN/None) entries in each column.
df.isna().sum()

Release_Date         0
Title                0
Overview             0
Popularity           0
Vote_Count           0
Vote_Average         0
Original_Language    0
Genre                0
Poster_Url           0
dtype: int64

### Exploration Summary

In [80]:
# Taking another look at the numeric summary statistics to decide on further action.
df.describe()

Unnamed: 0,Popularity,Vote_Count,Vote_Average
count,9827.0,9827.0,9827.0
mean,40.326088,1392.805536,6.439534
std,108.873998,2611.206907,1.129759
min,13.354,0.0,0.0
25%,16.1285,146.0,5.9
50%,21.199,444.0,6.5
75%,35.1915,1376.0,7.1
max,5083.954,31077.0,10.0


1. The dataframe contains 9,827 rows and 9 columns.
2. The dataset appears clean, with no missing (NaN) values and no duplicate rows.
3. The Release_Date column should be converted to datetime format, and the relevant information (the release year) extracted.
4. The Overview, Original_Language, and Poster_Url columns may not be particularly useful for analysis.
5. There are noticeable outliers in the Popularity column.
6. Categorizing Vote_Average would facilitate more effective analysis.
7. The Genre column contains comma-separated values with extra white space that needs to be addressed.

# Data Preprocessing and Cleaning

In [81]:
# Convert the Release_Date column from object (string) to datetime dtype.
# The numeric guard keeps this cell safe to re-run: once a later step has
# reduced the column to integer years, pd.to_datetime would otherwise
# reinterpret those integers as nanosecond offsets from the Unix epoch and
# silently turn every date into 1970-01-01.

if not pd.api.types.is_numeric_dtype(df['Release_Date']):
    df['Release_Date'] = pd.to_datetime(df['Release_Date'])


In [82]:
# Extract the year component from the Release_Date column. The column keeps
# its original name but holds integer years from this point on.
# The dtype guard makes the cell safe to re-run: an integer column has no .dt
# accessor, so a second pass would otherwise raise AttributeError. A column
# that was never converted to datetime still fails loudly on .dt.

if not pd.api.types.is_integer_dtype(df['Release_Date']):
    df['Release_Date'] = df['Release_Date'].dt.year

# The dtype of Release_Date is now an integer type.

df['Release_Date'].dtypes

dtype('int32')

In [83]:
# Confirm the column now shows year values.

df.loc[:, 'Release_Date'].head(n=5)

0    2021
1    2022
2    2022
3    2021
4    2021
Name: Release_Date, dtype: int32

#### Removing specified columns from the dataframe and verifying the updated column list.

In [84]:
# Column labels before any columns are dropped.
df.columns

Index(['Release_Date', 'Title', 'Overview', 'Popularity', 'Vote_Count',
       'Vote_Average', 'Original_Language', 'Genre', 'Poster_Url'],
      dtype='object')

In [85]:
# Columns to remove from the frame.

drop_cols = ['Overview', 'Original_Language', 'Poster_Url']

# Drop by reassignment instead of inplace=True, and ignore labels that are
# already gone, so re-running this cell does not raise KeyError.
# NOTE(review): errors='ignore' also hides a misspelled label in drop_cols;
# the column listing below is the check for that.

df = df.drop(columns=drop_cols, errors='ignore')

# Verifying the updated column list

df.columns

Index(['Release_Date', 'Title', 'Popularity', 'Vote_Count', 'Vote_Average',
       'Genre'],
      dtype='object')

#### Previewing the updated dataframe

In [86]:
# First five rows after the date conversion and column removal.
df.head(n=5)

Unnamed: 0,Release_Date,Title,Popularity,Vote_Count,Vote_Average,Genre
0,2021,Spider-Man: No Way Home,5083.954,8940,8.3,"Action, Adventure, Science Fiction"
1,2022,The Batman,3827.658,1151,8.1,"Crime, Mystery, Thriller"
2,2022,No Exit,2618.087,122,6.3,Thriller
3,2021,Encanto,2402.201,5076,7.7,"Animation, Comedy, Family, Fantasy"
4,2021,The King's Man,1895.511,1793,7.0,"Action, Adventure, Thriller, War"
