In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load the raw IMDb movie listing from the Kaggle input directory and preview it.
movies_csv = '/kaggle/input/movies-dataset-for-feature-extracion-prediction/movies.csv'
df = pd.read_csv(movies_csv)
df.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,
3,Rick and Morty,(2013– ),"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,
4,Army of Thieves,(2021),"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,


In [3]:
# Column dtypes and non-null counts. VOTES and Gross are read as object (text), not numbers,
# so they have to be parsed explicitly before any numeric use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   MOVIES    9999 non-null   object 
 1   YEAR      9355 non-null   object 
 2   GENRE     9919 non-null   object 
 3   RATING    8179 non-null   float64
 4   ONE-LINE  9999 non-null   object 
 5   STARS     9999 non-null   object 
 6   VOTES     8179 non-null   object 
 7   RunTime   7041 non-null   float64
 8   Gross     460 non-null    object 
dtypes: float64(2), object(7)
memory usage: 703.2+ KB


---
After rechecking the data, I found that many records are duplicated even though some of their columns differ. For example, the movie "Rick and Morty" has 10 records in total: the data in the MOVIES, YEAR and GENRE columns is the same, while the other columns differ.

I looked this up on the internet and found a likely reason: one movie can have many directors or stars, so the different combinations of directors and stars for that movie are treated as different movies.

Most duplicated movies have 3 columns in common: MOVIES, YEAR and GENRE. So I decided to check for duplicates using only these 3 columns and keep the record that has the most VOTES. More VOTES can be interpreted as more people having encountered that record on IMDb.

---

In [4]:
# String clean-up helper used on every cell of the frame.

def trim(s):
    """Collapse each run of two or more spaces into one space and strip both ends.

    Values whose type is not exactly ``str`` (None, numbers, ...) are returned unchanged.
    """
    if type(s) != str:
        return s
    collapsed = re.sub('  +', ' ', s)
    return collapsed.strip()

In [5]:
# Drop duplicates on (MOVIES, YEAR, GENRE), keeping the record with the most votes.

def parse_votes(value):
    """Turn a comma-grouped vote string such as '21,062' into an int; pass other values through."""
    if isinstance(value, (str)):
        return int(value.replace(',', ''))
    return value


df2 = (
    df.copy()
      .replace({np.nan: None})          # missing values become None so trim() passes them through
      .applymap(trim)                   # tidy whitespace in every text cell
      .assign(VOTES=lambda frame: frame['VOTES'].apply(parse_votes))
      .sort_values(by='VOTES', ascending=False)   # highest vote count first ...
      .drop_duplicates(subset=['MOVIES', 'YEAR', 'GENRE'])   # ... so the first row kept is the most voted
      .sort_index()                     # restore the original row order
      .reset_index(drop=True)
)
df2

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,(2021),"Action, Horror, Thriller",6.1,A woman with a mysterious illness is forced in...,Director:\nPeter Thorwarth\n| \n Stars:\nPeri ...,21062.0,121.0,
1,Masters of the Universe: Revelation,(2021– ),"Animation, Action, Adventure",5.0,The war for Eternia begins again in what may b...,"Stars:\nChris Wood, \nSarah Michelle Gellar, \...",17870.0,25.0,
2,The Walking Dead,(2010–2022),"Drama, Horror, Thriller",8.2,Sheriff Deputy Rick Grimes wakes up from a com...,"Stars:\nAndrew Lincoln, \nNorman Reedus, \nMel...",885805.0,44.0,
3,Rick and Morty,(2013– ),"Animation, Adventure, Comedy",9.2,An animated series that follows the exploits o...,"Stars:\nJustin Roiland, \nChris Parnell, \nSpe...",414849.0,23.0,
4,Army of Thieves,(2021),"Action, Crime, Horror",,"A prequel, set before the events of Army of th...",Director:\nMatthias Schweighöfer\n| \n Stars:\...,,,
...,...,...,...,...,...,...,...,...,...
6493,The Talk,(2010– ),Talk-Show,,Actor Ralph Macchio (Cobra Kai (2018));,"Stars:\nSharon Osbourne, \nSheryl Underwood, \...",,38.0,
6494,Kajko i Kokosz,(2021– ),"Animation, Action, Adventure",7.1,Add a Plot,Director:\nMichal Sledzinski\n| \n Stars:\nArt...,34.0,,
6495,God's Favorite Idiot,,Comedy,,Add a Plot,"Stars:\nLeslie Bibb, \nMelissa McCarthy, \nKev...",,,
6496,Astérix,(2023),"Animation, Action, Adventure",,Add a Plot,,,,


---
First, I noticed that the data in the YEAR column does not follow a single pattern. From what I've seen, there are 4 patterns:
1. 1 Year: ####
2. \> 1 Year: ####–####
3. Ongoing: ####–
4. No Year (Blank data)

It would be better to split the YEAR column into 2 columns, "Start Year" and "End Year", so that each column carries exactly one meaning. I've decided to leave "End Year" blank if the movie was still ongoing at the time the data was collected.

---

In [6]:
# insert start_year, end_year
# drop YEAR

def split_year(year):
    """Parse one raw YEAR cell into a ``(start_year, end_year)`` pair of 4-digit strings.

    Patterns are tried in this order:
      * ``####–####`` -> (first year, second year)
      * ``####–``     -> (year, None)      still ongoing, so no end year
      * ``####``      -> (year, year)      single-year title
    Anything else, including non-string values such as None, yields (None, None).
    The separator is an en dash (U+2013), as it appears in the data.
    """
    if not isinstance(year, str):
        return None, None

    # Raw string literals: '\d' inside a normal string is an invalid escape sequence.
    span = re.search(r'(\d{4})–(\d{4})', year)
    if span:
        return span.group(1), span.group(2)

    ongoing = re.search(r'(\d{4})–', year)
    if ongoing:
        return ongoing.group(1), None

    single = re.search(r'\d{4}', year)
    if single:
        return single.group(), single.group()

    return None, None


year_pairs = [split_year(y) for y in df2['YEAR']]
start_year = [pair[0] for pair in year_pairs]
end_year = [pair[1] for pair in year_pairs]

# Place the two new columns right after MOVIES, then remove the raw column.
df2.insert(1, 'start_year', start_year)
df2.insert(2, 'end_year', end_year)
df2 = df2.drop(columns=['YEAR'])
df2

Unnamed: 0,MOVIES,start_year,end_year,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,2021,2021,"Action, Horror, Thriller",6.1,A woman with a mysterious illness is forced in...,Director:\nPeter Thorwarth\n| \n Stars:\nPeri ...,21062.0,121.0,
1,Masters of the Universe: Revelation,2021,,"Animation, Action, Adventure",5.0,The war for Eternia begins again in what may b...,"Stars:\nChris Wood, \nSarah Michelle Gellar, \...",17870.0,25.0,
2,The Walking Dead,2010,2022,"Drama, Horror, Thriller",8.2,Sheriff Deputy Rick Grimes wakes up from a com...,"Stars:\nAndrew Lincoln, \nNorman Reedus, \nMel...",885805.0,44.0,
3,Rick and Morty,2013,,"Animation, Adventure, Comedy",9.2,An animated series that follows the exploits o...,"Stars:\nJustin Roiland, \nChris Parnell, \nSpe...",414849.0,23.0,
4,Army of Thieves,2021,2021,"Action, Crime, Horror",,"A prequel, set before the events of Army of th...",Director:\nMatthias Schweighöfer\n| \n Stars:\...,,,
...,...,...,...,...,...,...,...,...,...,...
6493,The Talk,2010,,Talk-Show,,Actor Ralph Macchio (Cobra Kai (2018));,"Stars:\nSharon Osbourne, \nSheryl Underwood, \...",,38.0,
6494,Kajko i Kokosz,2021,,"Animation, Action, Adventure",7.1,Add a Plot,Director:\nMichal Sledzinski\n| \n Stars:\nArt...,34.0,,
6495,God's Favorite Idiot,,,Comedy,,Add a Plot,"Stars:\nLeslie Bibb, \nMelissa McCarthy, \nKev...",,,
6496,Astérix,2023,2023,"Animation, Action, Adventure",,Add a Plot,,,,


In [7]:
# Split the comma-separated GENRE text into one column per genre slot.
genre_parts = df2['GENRE'].str.split(', ', expand=True)
exp_genre = genre_parts.rename(columns={0: 'genre1', 1: 'genre2', 2: 'genre3'})
exp_genre

Unnamed: 0,genre1,genre2,genre3
0,Action,Horror,Thriller
1,Animation,Action,Adventure
2,Drama,Horror,Thriller
3,Animation,Adventure,Comedy
4,Action,Crime,Horror
...,...,...,...
6493,Talk-Show,,
6494,Animation,Action,Adventure
6495,Comedy,,
6496,Animation,Action,Adventure


In [8]:
# List the distinct genre labels across all three slots (stack() drops the empty slots).
stacked_genres = exp_genre.stack().reset_index(drop=True)
exp_genre_unique = stacked_genres.unique()
pd.Series(exp_genre_unique).sort_values().reset_index(drop=True)

0          Action
1       Adventure
2       Animation
3       Biography
4          Comedy
5           Crime
6     Documentary
7           Drama
8          Family
9         Fantasy
10      Film-Noir
11      Game-Show
12        History
13         Horror
14          Music
15        Musical
16        Mystery
17           News
18     Reality-TV
19        Romance
20         Sci-Fi
21          Short
22          Sport
23      Talk-Show
24       Thriller
25            War
26        Western
dtype: object

---
Each movie has at most 3 genres, and there are 27 unique genre types. Two of them look ambiguous: "Music" and "Musical". I treat them as the same genre and replace "Music" with "Musical", which leaves 26 unique genre types in total. I will one-hot encode all 26 genres.

---

In [9]:
# Merge the "Music" label into "Musical" (whole-cell matches only).
exp_genre = exp_genre.replace(to_replace='Music', value='Musical')
exp_genre

Unnamed: 0,genre1,genre2,genre3
0,Action,Horror,Thriller
1,Animation,Action,Adventure
2,Drama,Horror,Thriller
3,Animation,Adventure,Comedy
4,Action,Crime,Horror
...,...,...,...
6493,Talk-Show,,
6494,Animation,Action,Adventure
6495,Comedy,,
6496,Animation,Action,Adventure


In [10]:
# Recompute the distinct genre labels after the merge and keep them sorted for later use.
merged_labels = exp_genre.stack().reset_index(drop=True).unique()
exp_genre_unique = (
    pd.Series(merged_labels)
      .sort_values()
      .reset_index(drop=True)
)
exp_genre_unique

0          Action
1       Adventure
2       Animation
3       Biography
4          Comedy
5           Crime
6     Documentary
7           Drama
8          Family
9         Fantasy
10      Film-Noir
11      Game-Show
12        History
13         Horror
14        Musical
15        Mystery
16           News
17     Reality-TV
18        Romance
19         Sci-Fi
20          Short
21          Sport
22      Talk-Show
23       Thriller
24            War
25        Western
dtype: object

In [11]:
# Swap the single GENRE column for the three per-slot genre columns (positions 3, 4, 5).
df2 = df2.drop(columns=['GENRE'])
for offset, genre_col in enumerate(['genre1', 'genre2', 'genre3']):
    df2.insert(3 + offset, genre_col, exp_genre[genre_col])
df2

Unnamed: 0,MOVIES,start_year,end_year,genre1,genre2,genre3,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,2021,2021,Action,Horror,Thriller,6.1,A woman with a mysterious illness is forced in...,Director:\nPeter Thorwarth\n| \n Stars:\nPeri ...,21062.0,121.0,
1,Masters of the Universe: Revelation,2021,,Animation,Action,Adventure,5.0,The war for Eternia begins again in what may b...,"Stars:\nChris Wood, \nSarah Michelle Gellar, \...",17870.0,25.0,
2,The Walking Dead,2010,2022,Drama,Horror,Thriller,8.2,Sheriff Deputy Rick Grimes wakes up from a com...,"Stars:\nAndrew Lincoln, \nNorman Reedus, \nMel...",885805.0,44.0,
3,Rick and Morty,2013,,Animation,Adventure,Comedy,9.2,An animated series that follows the exploits o...,"Stars:\nJustin Roiland, \nChris Parnell, \nSpe...",414849.0,23.0,
4,Army of Thieves,2021,2021,Action,Crime,Horror,,"A prequel, set before the events of Army of th...",Director:\nMatthias Schweighöfer\n| \n Stars:\...,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
6493,The Talk,2010,,Talk-Show,,,,Actor Ralph Macchio (Cobra Kai (2018));,"Stars:\nSharon Osbourne, \nSheryl Underwood, \...",,38.0,
6494,Kajko i Kokosz,2021,,Animation,Action,Adventure,7.1,Add a Plot,Director:\nMichal Sledzinski\n| \n Stars:\nArt...,34.0,,
6495,God's Favorite Idiot,,,Comedy,,,,Add a Plot,"Stars:\nLeslie Bibb, \nMelissa McCarthy, \nKev...",,,
6496,Astérix,2023,2023,Animation,Action,Adventure,,Add a Plot,,,,


In [12]:
# One-hot encoding for genre: one boolean column per genre label, True when the
# label appears in any of the three genre slots of that row.
genre_slots = df2.loc[:, 'genre1':'genre3']

# Vectorized comparison instead of a row-wise apply per genre; empty slots
# (None/NaN) never compare equal to a label, so they contribute False.
res = pd.DataFrame({
    genre: genre_slots.eq(genre).any(axis=1)
    for genre in exp_genre_unique
})
res

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,True,True,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,True,True,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6493,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
6494,True,True,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6495,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6496,True,True,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [13]:
# Rebuild the frame with the one-hot block placed between genre3 and RATING.
left_part = df2.loc[:, 'MOVIES':'genre3']
right_part = df2.loc[:, 'RATING':]
df2 = pd.concat([left_part, res, right_part], axis=1)
df2

Unnamed: 0,MOVIES,start_year,end_year,genre1,genre2,genre3,Action,Adventure,Animation,Biography,...,Talk-Show,Thriller,War,Western,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,2021,2021,Action,Horror,Thriller,True,False,False,False,...,False,True,False,False,6.1,A woman with a mysterious illness is forced in...,Director:\nPeter Thorwarth\n| \n Stars:\nPeri ...,21062.0,121.0,
1,Masters of the Universe: Revelation,2021,,Animation,Action,Adventure,True,True,True,False,...,False,False,False,False,5.0,The war for Eternia begins again in what may b...,"Stars:\nChris Wood, \nSarah Michelle Gellar, \...",17870.0,25.0,
2,The Walking Dead,2010,2022,Drama,Horror,Thriller,False,False,False,False,...,False,True,False,False,8.2,Sheriff Deputy Rick Grimes wakes up from a com...,"Stars:\nAndrew Lincoln, \nNorman Reedus, \nMel...",885805.0,44.0,
3,Rick and Morty,2013,,Animation,Adventure,Comedy,False,True,True,False,...,False,False,False,False,9.2,An animated series that follows the exploits o...,"Stars:\nJustin Roiland, \nChris Parnell, \nSpe...",414849.0,23.0,
4,Army of Thieves,2021,2021,Action,Crime,Horror,True,False,False,False,...,False,False,False,False,,"A prequel, set before the events of Army of th...",Director:\nMatthias Schweighöfer\n| \n Stars:\...,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6493,The Talk,2010,,Talk-Show,,,False,False,False,False,...,True,False,False,False,,Actor Ralph Macchio (Cobra Kai (2018));,"Stars:\nSharon Osbourne, \nSheryl Underwood, \...",,38.0,
6494,Kajko i Kokosz,2021,,Animation,Action,Adventure,True,True,True,False,...,False,False,False,False,7.1,Add a Plot,Director:\nMichal Sledzinski\n| \n Stars:\nArt...,34.0,,
6495,God's Favorite Idiot,,,Comedy,,,False,False,False,False,...,False,False,False,False,,Add a Plot,"Stars:\nLeslie Bibb, \nMelissa McCarthy, \nKev...",,,
6496,Astérix,2023,2023,Animation,Action,Adventure,True,True,True,False,...,False,False,False,False,,Add a Plot,,,,


---
The ONE-LINE column has no blanks, but the placeholder text "Add a Plot" should be considered blank data. So, replace these values with real blanks and trim the results.

---

In [14]:
# Replace the 'Add a Plot' placeholder with None.
# The mapping form is used on purpose: with a scalar `to_replace`, an explicit
# `value=None` was ignored by pandas releases before 1.4 and the cell was
# forward-filled from the previous row instead of being blanked.
df2['ONE-LINE'] = df2['ONE-LINE'].replace({'Add a Plot': None})
df2

Unnamed: 0,MOVIES,start_year,end_year,genre1,genre2,genre3,Action,Adventure,Animation,Biography,...,Talk-Show,Thriller,War,Western,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,2021,2021,Action,Horror,Thriller,True,False,False,False,...,False,True,False,False,6.1,A woman with a mysterious illness is forced in...,Director:\nPeter Thorwarth\n| \n Stars:\nPeri ...,21062.0,121.0,
1,Masters of the Universe: Revelation,2021,,Animation,Action,Adventure,True,True,True,False,...,False,False,False,False,5.0,The war for Eternia begins again in what may b...,"Stars:\nChris Wood, \nSarah Michelle Gellar, \...",17870.0,25.0,
2,The Walking Dead,2010,2022,Drama,Horror,Thriller,False,False,False,False,...,False,True,False,False,8.2,Sheriff Deputy Rick Grimes wakes up from a com...,"Stars:\nAndrew Lincoln, \nNorman Reedus, \nMel...",885805.0,44.0,
3,Rick and Morty,2013,,Animation,Adventure,Comedy,False,True,True,False,...,False,False,False,False,9.2,An animated series that follows the exploits o...,"Stars:\nJustin Roiland, \nChris Parnell, \nSpe...",414849.0,23.0,
4,Army of Thieves,2021,2021,Action,Crime,Horror,True,False,False,False,...,False,False,False,False,,"A prequel, set before the events of Army of th...",Director:\nMatthias Schweighöfer\n| \n Stars:\...,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6493,The Talk,2010,,Talk-Show,,,False,False,False,False,...,True,False,False,False,,Actor Ralph Macchio (Cobra Kai (2018));,"Stars:\nSharon Osbourne, \nSheryl Underwood, \...",,38.0,
6494,Kajko i Kokosz,2021,,Animation,Action,Adventure,True,True,True,False,...,False,False,False,False,7.1,,Director:\nMichal Sledzinski\n| \n Stars:\nArt...,34.0,,
6495,God's Favorite Idiot,,,Comedy,,,False,False,False,False,...,False,False,False,False,,,"Stars:\nLeslie Bibb, \nMelissa McCarthy, \nKev...",,,
6496,Astérix,2023,2023,Animation,Action,Adventure,True,True,True,False,...,False,False,False,False,,,,,,


---
There are at most 2 kinds of information in the STARS column, so I've decided to split the data into 2 columns: directors and stars. The patterns come in many combinations: 0, 1 or 2+ director(s) together with 0, 1 or 2+ star(s). Because there are so many names, I won't split the individual names into separate columns: all directors go into one column and all stars go into another.

---

In [15]:
# split directors and stars apart

def split_credits(text):
    """Split one cleaned STARS cell into ``[directors, stars]``.

    The cell is expected to look like ``Director(s):<names>| Star(s):<names>``, with
    either part optional. A missing part is returned as None; a non-string cell
    (e.g. None) yields ``[None, None]`` instead of raising inside ``re.search``.
    """
    if not isinstance(text, str):
        return [None, None]

    has_director = bool(re.search(r'Directors?:', text))
    has_star = bool(re.search(r'Stars?:', text))

    if has_director and has_star:
        # Raw string: '\|' inside a normal string literal is an invalid escape sequence.
        director, star = re.search(r'Directors?:(.*)\|.*Stars?:(.*)', text).groups()
        return [director, star]
    if has_director:
        return [re.search(r'Directors?:(.*)', text).group(1), None]
    if has_star:
        return [None, re.search(r'Stars?:(.*)', text).group(1)]
    return [None, None]


# Remove the line breaks embedded in the text, then re-trim the spaces they leave behind.
df2['STARS'] = df2['STARS'].replace('\n', '', regex=True).apply(trim)
res = [split_credits(x) for x in df2['STARS']]

# Reuse df2's index so the new columns align row-for-row when inserted into df2.
director_star = pd.DataFrame(res, columns=['directors', 'stars'], index=df2.index)
director_star

Unnamed: 0,directors,stars
0,Peter Thorwarth,"Peri Baumeister, Carl Anton Koch, Alexander Sc..."
1,,"Chris Wood, Sarah Michelle Gellar, Lena Headey..."
2,,"Andrew Lincoln, Norman Reedus, Melissa McBride..."
3,,"Justin Roiland, Chris Parnell, Spencer Grammer..."
4,Matthias Schweighöfer,"Matthias Schweighöfer, Nathalie Emmanuel, Ruby..."
...,...,...
6493,,"Sharon Osbourne, Sheryl Underwood, Carrie Ann ..."
6494,Michal Sledzinski,"Artur Pontek, Michal Piela, Maciej Kosmala, Ja..."
6495,,"Leslie Bibb, Melissa McCarthy, Kevin Dunn, Ben..."
6496,,


In [16]:
# Drop STARS and insert directors and stars in the slot STARS occupied.
# The position is looked up rather than hard-coded, so it stays correct if the
# number of one-hot genre columns in front of it changes.
stars_pos = list(df2.columns).index('STARS')
df2 = df2.drop(columns=['STARS'])
df2.insert(stars_pos, 'directors', director_star['directors'])
df2.insert(stars_pos + 1, 'stars', director_star['stars'])
df2

Unnamed: 0,MOVIES,start_year,end_year,genre1,genre2,genre3,Action,Adventure,Animation,Biography,...,Thriller,War,Western,RATING,ONE-LINE,directors,stars,VOTES,RunTime,Gross
0,Blood Red Sky,2021,2021,Action,Horror,Thriller,True,False,False,False,...,True,False,False,6.1,A woman with a mysterious illness is forced in...,Peter Thorwarth,"Peri Baumeister, Carl Anton Koch, Alexander Sc...",21062.0,121.0,
1,Masters of the Universe: Revelation,2021,,Animation,Action,Adventure,True,True,True,False,...,False,False,False,5.0,The war for Eternia begins again in what may b...,,"Chris Wood, Sarah Michelle Gellar, Lena Headey...",17870.0,25.0,
2,The Walking Dead,2010,2022,Drama,Horror,Thriller,False,False,False,False,...,True,False,False,8.2,Sheriff Deputy Rick Grimes wakes up from a com...,,"Andrew Lincoln, Norman Reedus, Melissa McBride...",885805.0,44.0,
3,Rick and Morty,2013,,Animation,Adventure,Comedy,False,True,True,False,...,False,False,False,9.2,An animated series that follows the exploits o...,,"Justin Roiland, Chris Parnell, Spencer Grammer...",414849.0,23.0,
4,Army of Thieves,2021,2021,Action,Crime,Horror,True,False,False,False,...,False,False,False,,"A prequel, set before the events of Army of th...",Matthias Schweighöfer,"Matthias Schweighöfer, Nathalie Emmanuel, Ruby...",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6493,The Talk,2010,,Talk-Show,,,False,False,False,False,...,False,False,False,,Actor Ralph Macchio (Cobra Kai (2018));,,"Sharon Osbourne, Sheryl Underwood, Carrie Ann ...",,38.0,
6494,Kajko i Kokosz,2021,,Animation,Action,Adventure,True,True,True,False,...,False,False,False,7.1,,Michal Sledzinski,"Artur Pontek, Michal Piela, Maciej Kosmala, Ja...",34.0,,
6495,God's Favorite Idiot,,,Comedy,,,False,False,False,False,...,False,False,False,,,,"Leslie Bibb, Melissa McCarthy, Kevin Dunn, Ben...",,,
6496,Astérix,2023,2023,Animation,Action,Adventure,True,True,True,False,...,False,False,False,,,,,,,


In [17]:
# Parse Gross to float: strip the '$' prefix and the 'M' suffix, keep the number as-is.
# Raw string: '\$' inside a normal string literal is an invalid escape sequence.
df2['Gross'] = df2['Gross'].str.replace(r'\$|M', '', regex=True).astype(float)
df2

Unnamed: 0,MOVIES,start_year,end_year,genre1,genre2,genre3,Action,Adventure,Animation,Biography,...,Thriller,War,Western,RATING,ONE-LINE,directors,stars,VOTES,RunTime,Gross
0,Blood Red Sky,2021,2021,Action,Horror,Thriller,True,False,False,False,...,True,False,False,6.1,A woman with a mysterious illness is forced in...,Peter Thorwarth,"Peri Baumeister, Carl Anton Koch, Alexander Sc...",21062.0,121.0,
1,Masters of the Universe: Revelation,2021,,Animation,Action,Adventure,True,True,True,False,...,False,False,False,5.0,The war for Eternia begins again in what may b...,,"Chris Wood, Sarah Michelle Gellar, Lena Headey...",17870.0,25.0,
2,The Walking Dead,2010,2022,Drama,Horror,Thriller,False,False,False,False,...,True,False,False,8.2,Sheriff Deputy Rick Grimes wakes up from a com...,,"Andrew Lincoln, Norman Reedus, Melissa McBride...",885805.0,44.0,
3,Rick and Morty,2013,,Animation,Adventure,Comedy,False,True,True,False,...,False,False,False,9.2,An animated series that follows the exploits o...,,"Justin Roiland, Chris Parnell, Spencer Grammer...",414849.0,23.0,
4,Army of Thieves,2021,2021,Action,Crime,Horror,True,False,False,False,...,False,False,False,,"A prequel, set before the events of Army of th...",Matthias Schweighöfer,"Matthias Schweighöfer, Nathalie Emmanuel, Ruby...",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6493,The Talk,2010,,Talk-Show,,,False,False,False,False,...,False,False,False,,Actor Ralph Macchio (Cobra Kai (2018));,,"Sharon Osbourne, Sheryl Underwood, Carrie Ann ...",,38.0,
6494,Kajko i Kokosz,2021,,Animation,Action,Adventure,True,True,True,False,...,False,False,False,7.1,,Michal Sledzinski,"Artur Pontek, Michal Piela, Maciej Kosmala, Ja...",34.0,,
6495,God's Favorite Idiot,,,Comedy,,,False,False,False,False,...,False,False,False,,,,"Leslie Bibb, Melissa McCarthy, Kevin Dunn, Ben...",,,
6496,Astérix,2023,2023,Animation,Action,Adventure,True,True,True,False,...,False,False,False,,,,,,,


In [18]:
# Normalise every column name to lower case.
df2.columns = list(map(str.lower, df2.columns))
df2

Unnamed: 0,movies,start_year,end_year,genre1,genre2,genre3,action,adventure,animation,biography,...,thriller,war,western,rating,one-line,directors,stars,votes,runtime,gross
0,Blood Red Sky,2021,2021,Action,Horror,Thriller,True,False,False,False,...,True,False,False,6.1,A woman with a mysterious illness is forced in...,Peter Thorwarth,"Peri Baumeister, Carl Anton Koch, Alexander Sc...",21062.0,121.0,
1,Masters of the Universe: Revelation,2021,,Animation,Action,Adventure,True,True,True,False,...,False,False,False,5.0,The war for Eternia begins again in what may b...,,"Chris Wood, Sarah Michelle Gellar, Lena Headey...",17870.0,25.0,
2,The Walking Dead,2010,2022,Drama,Horror,Thriller,False,False,False,False,...,True,False,False,8.2,Sheriff Deputy Rick Grimes wakes up from a com...,,"Andrew Lincoln, Norman Reedus, Melissa McBride...",885805.0,44.0,
3,Rick and Morty,2013,,Animation,Adventure,Comedy,False,True,True,False,...,False,False,False,9.2,An animated series that follows the exploits o...,,"Justin Roiland, Chris Parnell, Spencer Grammer...",414849.0,23.0,
4,Army of Thieves,2021,2021,Action,Crime,Horror,True,False,False,False,...,False,False,False,,"A prequel, set before the events of Army of th...",Matthias Schweighöfer,"Matthias Schweighöfer, Nathalie Emmanuel, Ruby...",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6493,The Talk,2010,,Talk-Show,,,False,False,False,False,...,False,False,False,,Actor Ralph Macchio (Cobra Kai (2018));,,"Sharon Osbourne, Sheryl Underwood, Carrie Ann ...",,38.0,
6494,Kajko i Kokosz,2021,,Animation,Action,Adventure,True,True,True,False,...,False,False,False,7.1,,Michal Sledzinski,"Artur Pontek, Michal Piela, Maciej Kosmala, Ja...",34.0,,
6495,God's Favorite Idiot,,,Comedy,,,False,False,False,False,...,False,False,False,,,,"Leslie Bibb, Melissa McCarthy, Kevin Dunn, Ben...",,,
6496,Astérix,2023,2023,Animation,Action,Adventure,True,True,True,False,...,False,False,False,,,,,,,


In [19]:
# Show every row whose title occurs more than once, to check whether rows that differ
# in year/genre are really different movies.
same_title = df2.duplicated(subset=['movies'], keep=False)
df2.loc[same_title].sort_values(by='movies')

Unnamed: 0,movies,start_year,end_year,genre1,genre2,genre3,action,adventure,animation,biography,...,thriller,war,western,rating,one-line,directors,stars,votes,runtime,gross
2171,Amend: The Fight for America,2021,2021,Documentary,History,,False,False,False,False,...,False,False,False,7.9,"Will Smith hosts this look at the evolving, of...",,"Will Smith, Bryan Stevenson, Larry Wilmore, Sa...",1068.0,,
5954,Amend: The Fight for America,2021,2021,Documentary,,,False,False,False,False,...,False,False,False,6.2,,,Will Smith,16.0,,
5077,Astérix,,,Animation,Action,Adventure,True,True,True,False,...,False,False,False,,"In the glory days of the Roman Empire, one vil...",,,,,
6496,Astérix,2023,2023,Animation,Action,Adventure,True,True,True,False,...,False,False,False,,,,,,,
128,Avatar: The Last Airbender,2005,2008,Animation,Action,Adventure,True,True,True,False,...,False,False,False,9.3,"In a war-torn world of elemental magic, a youn...",,"Dee Bradley Baker, Zach Tyler, Mae Whitman, Ja...",265845.0,23.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3771,Verónica,2017,2017,Drama,Mystery,Thriller,False,False,False,False,...,True,False,False,5.9,A female Psychologist who has stopped practici...,"Carlos Algara, Alejandro Martinez-Beltran","Olga Segura, Arcelia Ramírez, Sofía Garza, Hor...",1342.0,81.0,
1321,Wanderlust,2018,,Drama,,,False,False,False,False,...,False,False,False,7.1,A therapist tries to save her marriage after a...,,"Toni Collette, Steven Mackintosh, Joe Hurst, E...",4825.0,60.0,
5459,Wanderlust,2006,2006,Documentary,History,,False,False,False,False,...,False,False,False,6.6,A documentary on road movies and their effect ...,"Shari Springer Berman, Robert Pulcini","Allison Anders, Jeanine Basinger, Robert Bento...",122.0,84.0,
1526,Wednesday,,,Comedy,Family,Fantasy,False,False,False,False,...,False,False,False,,A modernized live-action version of 'The Addam...,,Jenna Ortega,,,


In [20]:
# Which titles still occur more than once, and how many such titles there are.
dup_titles = (
    df2.loc[df2.duplicated(subset=['movies'], keep=False)]
       .sort_values(by='movies')['movies']
       .unique()
)
print(dup_titles)
print(dup_titles.shape)

# There are 69 titles that still have duplicates, differing in either year or genre 🤔🤔
# That is a small part of the data, so I've decided to drop these duplicated rows and
# keep the one with the most votes.

['Amend: The Fight for America' 'Astérix' 'Avatar: The Last Airbender'
 'Away' 'Bad Blood' 'Beauty and the Beast' 'Blackout' 'Blood Brother'
 'Bodyguard' 'Boku dake ga inai machi' 'Braqueurs' 'Carmen Sandiego'
 'Daybreak' 'Eden' 'Exatlon Challenge' 'Extracurricular' 'Fearless'
 'Freedom' 'Garbage' 'Glória' 'Hagane no renkinjutsushi'
 'Happiness Ever After' 'Heartbreak High' 'Heist' 'Hit and Run' 'Home'
 'Home for Christmas' 'House Arrest' 'Into the Night' 'Jinn' 'Jonas'
 'Kakegurui' 'Kingdom' 'Ludo' 'Mai' 'Malibu Rescue' 'Maska' 'Matilda'
 'Mob Psycho 100' 'One of Us' 'Painkiller' 'Paranoid' 'Paranormal'
 'Perdida' 'Private Life' 'Revenge' "Rosemary's Baby" 'Safe' 'Security'
 'Sexy Beasts' 'Snowpiercer' 'Sword Art Online' 'Tales of the City'
 'The House' 'The Legend of Cocaine Island' 'The Lincoln Lawyer'
 'The Oscars' 'The Pentaverate' 'The Silence' 'The Stranger' 'The Watcher'
 'The Whole Truth' 'Titans' 'True Story' 'Ultraman' 'Undercover'
 'Verónica' 'Wanderlust' 'Wednesday']
(69,)

In [21]:
# Write out the version that still contains the 69 repeated titles.
df2.to_csv(
    './cleaned_movie_dataset_69_movies_duplicated.csv',
    index_label='index',
)

In [22]:
# Make the unique-title version: one row per movie name, keeping the most-voted row.
# Sort DESCENDING so the first row seen for each title is the one with the most votes
# (an ascending sort would keep the least-voted row). Rows without votes sort last,
# and the stable sort keeps the earliest original row on ties.
df3 = (
    df2.sort_values(by='votes', ascending=False, kind='mergesort')
       .drop_duplicates(subset=['movies'])
       .sort_index()
       .reset_index(drop=True)
)
df3

Unnamed: 0,movies,start_year,end_year,genre1,genre2,genre3,action,adventure,animation,biography,...,thriller,war,western,rating,one-line,directors,stars,votes,runtime,gross
0,Blood Red Sky,2021,2021,Action,Horror,Thriller,True,False,False,False,...,True,False,False,6.1,A woman with a mysterious illness is forced in...,Peter Thorwarth,"Peri Baumeister, Carl Anton Koch, Alexander Sc...",21062.0,121.0,
1,Masters of the Universe: Revelation,2021,,Animation,Action,Adventure,True,True,True,False,...,False,False,False,5.0,The war for Eternia begins again in what may b...,,"Chris Wood, Sarah Michelle Gellar, Lena Headey...",17870.0,25.0,
2,The Walking Dead,2010,2022,Drama,Horror,Thriller,False,False,False,False,...,True,False,False,8.2,Sheriff Deputy Rick Grimes wakes up from a com...,,"Andrew Lincoln, Norman Reedus, Melissa McBride...",885805.0,44.0,
3,Rick and Morty,2013,,Animation,Adventure,Comedy,False,True,True,False,...,False,False,False,9.2,An animated series that follows the exploits o...,,"Justin Roiland, Chris Parnell, Spencer Grammer...",414849.0,23.0,
4,Army of Thieves,2021,2021,Action,Crime,Horror,True,False,False,False,...,False,False,False,,"A prequel, set before the events of Army of th...",Matthias Schweighöfer,"Matthias Schweighöfer, Nathalie Emmanuel, Ruby...",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6418,Son of Abish,2014,,Comedy,Talk-Show,,False,False,False,False,...,False,False,False,,"On this episode, catch Abhishek Bachchan and I...",,"Abish Mathew, Abhishek Bachchan, Inayat Verma,...",,,
6419,Dad Stop Embarrassing Me,2021,2021,Comedy,Family,,False,False,False,False,...,False,False,False,5.7,"After Sasha moves in with Brian, a mortifying ...",Ken Whittingham,"Jamie Foxx, David Alan Grier, Kyla-Drew, Porsc...",98.0,,
6420,The Talk,2010,,Talk-Show,,,False,False,False,False,...,False,False,False,,Actor Ralph Macchio (Cobra Kai (2018));,,"Sharon Osbourne, Sheryl Underwood, Carrie Ann ...",,38.0,
6421,Kajko i Kokosz,2021,,Animation,Action,Adventure,True,True,True,False,...,False,False,False,7.1,,Michal Sledzinski,"Artur Pontek, Michal Piela, Maciej Kosmala, Ja...",34.0,,


In [23]:
# Save the unique-title version as another csv. Going by the displayed indices
# (0-6496 before, 0-6421 after) this step reduces 6497 rows to 6422 rows.
df3.to_csv(
    './cleaned_movie_dataset_no_duplicates.csv',
    index_label='index',
)