In [1]:
import numpy as np
import pandas as pd
import glob

### Combining csv files to get one csv

In [2]:
csv_files_path = 'Movie_data_*.csv'
csv_files = glob.glob(csv_files_path)
csv_files

['Movie_data_2.csv',
 'Movie_data_3.csv',
 'Movie_data_5.csv',
 'Movie_data_1.csv',
 'Movie_data_4.csv']

In [3]:
dataframes = []
for file in csv_files:
    df = pd.read_csv(file,encoding='UTF8')
    dataframes.append(df)

In [4]:
combined_df = pd.concat(dataframes, ignore_index=True)

# Save the combined dataframe to a new CSV file if desired
combined_df.to_csv('Movie_data.csv', index=False)
combined_df.head()

Unnamed: 0,Title,Release year,Genre,Duration,Rating,Viewership Certificate,User votes,Plot synopsis,Director,Poster Link
0,Godzilla vs. Mechagodzilla,1974,"['Animation', 'Action', 'Adventure']",1h 24m,6.2,PG,8K,An Okinawan prophecy appears to foretell Earth...,Jun Fukuda,https://m.media-amazon.com/images/M/MV5BOTEzMD...
1,Respect,2021,"['Biography', 'Drama', 'Music']",2h 25m,6.6,PG-13,18K,Following the rise of Aretha Franklin's career...,Liesl Tommy,https://m.media-amazon.com/images/M/MV5BZWQxOD...
2,A New Leaf,1971,"['Comedy', 'Romance']",1h 42m,7.3,G,7.7K,Henry Graham lives the life of a playboy. When...,Elaine May,https://m.media-amazon.com/images/M/MV5BOGVjZW...
3,Bamboozled,2000,"['Comedy', 'Drama', 'Music']",2h 15m,6.7,R,12K,A frustrated African-American TV writer propos...,Spike Lee,https://m.media-amazon.com/images/M/MV5BMTI2OD...
4,A Cowgirl's Story,2017,"['Drama', 'Family']",1h 38m,5.6,PG,706,Dusty Rhodes (Bailee Madison) & her grandfathe...,Timothy Armstrong,https://m.media-amazon.com/images/M/MV5BNjQwYW...


## Loading the new combined csv into a dataframe

In [5]:
df1 = pd.read_csv("./Movie_data.csv")
df1.head()

Unnamed: 0,Title,Release year,Genre,Duration,Rating,Viewership Certificate,User votes,Plot synopsis,Director,Poster Link
0,Godzilla vs. Mechagodzilla,1974,"['Animation', 'Action', 'Adventure']",1h 24m,6.2,PG,8K,An Okinawan prophecy appears to foretell Earth...,Jun Fukuda,https://m.media-amazon.com/images/M/MV5BOTEzMD...
1,Respect,2021,"['Biography', 'Drama', 'Music']",2h 25m,6.6,PG-13,18K,Following the rise of Aretha Franklin's career...,Liesl Tommy,https://m.media-amazon.com/images/M/MV5BZWQxOD...
2,A New Leaf,1971,"['Comedy', 'Romance']",1h 42m,7.3,G,7.7K,Henry Graham lives the life of a playboy. When...,Elaine May,https://m.media-amazon.com/images/M/MV5BOGVjZW...
3,Bamboozled,2000,"['Comedy', 'Drama', 'Music']",2h 15m,6.7,R,12K,A frustrated African-American TV writer propos...,Spike Lee,https://m.media-amazon.com/images/M/MV5BMTI2OD...
4,A Cowgirl's Story,2017,"['Drama', 'Family']",1h 38m,5.6,PG,706,Dusty Rhodes (Bailee Madison) & her grandfathe...,Timothy Armstrong,https://m.media-amazon.com/images/M/MV5BNjQwYW...


In [6]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45885 entries, 0 to 45884
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Title                   45884 non-null  object 
 1   Release year            41884 non-null  object 
 2   Genre                   41270 non-null  object 
 3   Duration                31170 non-null  object 
 4   Rating                  31079 non-null  float64
 5   Viewership Certificate  31079 non-null  object 
 6   User votes              31079 non-null  object 
 7   Plot synopsis           31064 non-null  object 
 8   Director                31073 non-null  object 
 9   Poster Link             30259 non-null  object 
dtypes: float64(1), object(9)
memory usage: 3.5+ MB


## Data Cleansing

### Removing null values:

In [7]:
df1.isna().sum()

Title                         1
Release year               4001
Genre                      4615
Duration                  14715
Rating                    14806
Viewership Certificate    14806
User votes                14806
Plot synopsis             14821
Director                  14812
Poster Link               15626
dtype: int64

In [8]:
df1.dropna(inplace=True,ignore_index=True)

In [9]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30248 entries, 0 to 30247
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Title                   30248 non-null  object 
 1   Release year            30248 non-null  object 
 2   Genre                   30248 non-null  object 
 3   Duration                30248 non-null  object 
 4   Rating                  30248 non-null  float64
 5   Viewership Certificate  30248 non-null  object 
 6   User votes              30248 non-null  object 
 7   Plot synopsis           30248 non-null  object 
 8   Director                30248 non-null  object 
 9   Poster Link             30248 non-null  object 
dtypes: float64(1), object(9)
memory usage: 2.3+ MB


In [10]:
df1['Plot synopsis'].isna().sum()

0

### Removing duplicate values:

In [11]:
## Considering plot synopsis as it is unique for every movie while movie titles might be same for some movies.
df1['Plot synopsis'].is_unique

False

In [12]:
## Checking duplicates
duplicates = df1[df1.duplicated(subset=['Plot synopsis'], keep=False)]
print(duplicates)

                                     Title Release year  \
90                Where the Red Fern Grows         1974   
364    Superman II: The Richard Donner Cut         1980   
1710                               Macbeth         1948   
1917              Where the Red Fern Grows         2003   
2876               Secrets in the Building         2022   
...                                    ...          ...   
30211                            Mama Duck         1984   
30221                            Rebellion         2012   
30226                           Gallavants         2017   
30227                       The Dawnseeker         2010   
30247                            Echo Base         1951   

                                    Genre Duration  Rating  \
90                    ['Drama', 'Family']   1h 37m     6.9   
364    ['Action', 'Adventure', 'Romance']   1h 55m     7.6   
1710          ['Drama', 'History', 'War']   1h 47m     7.4   
1917                  ['Drama', 'Family']  

In [13]:
# Removing duplicate:
df1['Plot synopsis'] = df1['Plot synopsis'].str.strip()
df_new = df1.drop_duplicates(subset=['Plot synopsis'],ignore_index=True)
print(df_new['Plot synopsis'].is_unique)

True


In [14]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29140 entries, 0 to 29139
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Title                   29140 non-null  object 
 1   Release year            29140 non-null  object 
 2   Genre                   29140 non-null  object 
 3   Duration                29140 non-null  object 
 4   Rating                  29140 non-null  float64
 5   Viewership Certificate  29140 non-null  object 
 6   User votes              29140 non-null  object 
 7   Plot synopsis           29140 non-null  object 
 8   Director                29140 non-null  object 
 9   Poster Link             29140 non-null  object 
dtypes: float64(1), object(9)
memory usage: 2.2+ MB


### Modifying the user votes as it is in object format:

In [15]:
def modify_user_votes(x):
    if x[-1] == 'K':
        return float(x[:-1]) * 1000
    elif x[-1] == 'M':
        return float(x[:-1])*1000000
    else:
        return float(x)

df_new['User votes'] = df_new['User votes'].apply(modify_user_votes)
df_new.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['User votes'] = df_new['User votes'].apply(modify_user_votes)


Unnamed: 0,Title,Release year,Genre,Duration,Rating,Viewership Certificate,User votes,Plot synopsis,Director,Poster Link
0,Godzilla vs. Mechagodzilla,1974,"['Animation', 'Action', 'Adventure']",1h 24m,6.2,PG,8000.0,An Okinawan prophecy appears to foretell Earth...,Jun Fukuda,https://m.media-amazon.com/images/M/MV5BOTEzMD...
1,Respect,2021,"['Biography', 'Drama', 'Music']",2h 25m,6.6,PG-13,18000.0,Following the rise of Aretha Franklin's career...,Liesl Tommy,https://m.media-amazon.com/images/M/MV5BZWQxOD...
2,A New Leaf,1971,"['Comedy', 'Romance']",1h 42m,7.3,G,7700.0,Henry Graham lives the life of a playboy. When...,Elaine May,https://m.media-amazon.com/images/M/MV5BOGVjZW...
3,Bamboozled,2000,"['Comedy', 'Drama', 'Music']",2h 15m,6.7,R,12000.0,A frustrated African-American TV writer propos...,Spike Lee,https://m.media-amazon.com/images/M/MV5BMTI2OD...
4,A Cowgirl's Story,2017,"['Drama', 'Family']",1h 38m,5.6,PG,706.0,Dusty Rhodes (Bailee Madison) & her grandfathe...,Timothy Armstrong,https://m.media-amazon.com/images/M/MV5BNjQwYW...


In [16]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29140 entries, 0 to 29139
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Title                   29140 non-null  object 
 1   Release year            29140 non-null  object 
 2   Genre                   29140 non-null  object 
 3   Duration                29140 non-null  object 
 4   Rating                  29140 non-null  float64
 5   Viewership Certificate  29140 non-null  object 
 6   User votes              29140 non-null  float64
 7   Plot synopsis           29140 non-null  object 
 8   Director                29140 non-null  object 
 9   Poster Link             29140 non-null  object 
dtypes: float64(2), object(8)
memory usage: 2.2+ MB


## Label encoding for genre:

In [17]:
df_new['Genre'] = df_new['Genre'].str.strip("[]").str.replace("'","")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Genre'] = df_new['Genre'].str.strip("[]").str.replace("'","")


In [18]:
all_genres = [
    'Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 
    'Fantasy', 'Film-Noir', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Romance', 
    'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western'
]

In [19]:
df_encoded = df_new['Genre'].str.get_dummies(sep=',')
for genre in all_genres:
    if genre not in df_encoded.columns:
        df_encoded[genre] = 0
df_encoded = df_encoded[all_genres]
df_encoded

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,Music,Musical,Mystery,News,Romance,Sci-Fi,Sport,Thriller,War,Western
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29135,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29136,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29137,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29138,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [20]:
df_final = pd.concat([df_new, df_encoded], axis=1, ignore_index=False)
df_final.head()

Unnamed: 0,Title,Release year,Genre,Duration,Rating,Viewership Certificate,User votes,Plot synopsis,Director,Poster Link,...,Music,Musical,Mystery,News,Romance,Sci-Fi,Sport,Thriller,War,Western
0,Godzilla vs. Mechagodzilla,1974,"Animation, Action, Adventure",1h 24m,6.2,PG,8000.0,An Okinawan prophecy appears to foretell Earth...,Jun Fukuda,https://m.media-amazon.com/images/M/MV5BOTEzMD...,...,0,0,0,0,0,0,0,0,0,0
1,Respect,2021,"Biography, Drama, Music",2h 25m,6.6,PG-13,18000.0,Following the rise of Aretha Franklin's career...,Liesl Tommy,https://m.media-amazon.com/images/M/MV5BZWQxOD...,...,0,0,0,0,0,0,0,0,0,0
2,A New Leaf,1971,"Comedy, Romance",1h 42m,7.3,G,7700.0,Henry Graham lives the life of a playboy. When...,Elaine May,https://m.media-amazon.com/images/M/MV5BOGVjZW...,...,0,0,0,0,0,0,0,0,0,0
3,Bamboozled,2000,"Comedy, Drama, Music",2h 15m,6.7,R,12000.0,A frustrated African-American TV writer propos...,Spike Lee,https://m.media-amazon.com/images/M/MV5BMTI2OD...,...,0,0,0,0,0,0,0,0,0,0
4,A Cowgirl's Story,2017,"Drama, Family",1h 38m,5.6,PG,706.0,Dusty Rhodes (Bailee Madison) & her grandfathe...,Timothy Armstrong,https://m.media-amazon.com/images/M/MV5BNjQwYW...,...,0,0,0,0,0,0,0,0,0,0


In [23]:
## Saving to a csv:
df_final.to_csv("./Filtered_data.csv")

In [22]:
## Required Dataframe for model:
df = df_final.drop(['Release year','Genre','Duration','Viewership Certificate','Plot synopsis','Director','Poster Link'],axis=1)
df.head()

Unnamed: 0,Title,Rating,User votes,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,...,Music,Musical,Mystery,News,Romance,Sci-Fi,Sport,Thriller,War,Western
0,Godzilla vs. Mechagodzilla,6.2,8000.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Respect,6.6,18000.0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,A New Leaf,7.3,7700.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Bamboozled,6.7,12000.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,A Cowgirl's Story,5.6,706.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
