In [7]:
import numpy as np
import pandas as pd
import re

In [14]:
# Load the raw movie data.
# NOTE(review): absolute path, presumably the Colab working directory --
# adjust it when running the notebook elsewhere.
movies_df = pd.read_csv("/content/movies.csv")

# Column dtypes and non-null counts. info() prints its report and returns
# None, so it is called as its own statement instead of being packed into a
# tuple (which would display a stray None and a plain-text table).
movies_df.info()

# Display the first few rows of the dataframe; as the last expression of the
# cell it gets the notebook's rich table rendering.
movies_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    1000 non-null   int64  
 1   id            1000 non-null   int64  
 2   Description   1000 non-null   object 
 3   Title         1000 non-null   object 
 4   rating        1000 non-null   float64
 5   runtime-mins  1000 non-null   int64  
 6   Year          1000 non-null   int64  
 7   Genres        1000 non-null   object 
 8   release-date  1000 non-null   object 
 9   Directors     1000 non-null   object 
dtypes: float64(1), int64(4), object(5)
memory usage: 78.2+ KB


(   Unnamed: 0  id                               Description  \
 0           0   1  Worldwide Lifetime Gross: $2,923,706,026   
 1           1   2  Worldwide Lifetime Gross: $2,799,439,100   
 2           2   3  Worldwide Lifetime Gross: $2,320,250,281   
 3           3   4  Worldwide Lifetime Gross: $2,264,750,694   
 4           4   5  Worldwide Lifetime Gross: $2,071,310,218   
 
                                         Title  rating  runtime-mins  Year  \
 0                                      Avatar     7.9           162  2009   
 1                           Avengers: Endgame     8.4           181  2019   
 2                    Avatar: The Way of Water     7.5           192  2022   
 3                                     Titanic     7.9           194  1997   
 4  Star Wars: Episode VII - The Force Awakens     7.8           138  2015   
 
                                Genres release-date                 Directors  
 0  Action, Adventure, Fantasy, Sci-Fi   2009-12-18             

In [15]:
# Drop the unnecessary "Unnamed: 0" column.
# errors="ignore" makes the drop a no-op when the column is already gone, so
# re-running this cell does not raise KeyError.
movies_df = movies_df.drop(columns=["Unnamed: 0"], errors="ignore")

# Convert 'release-date' to datetime format
movies_df['release-date'] = pd.to_datetime(movies_df['release-date'], format='%Y-%m-%d')

# Strip any leading/trailing whitespace in the text columns
for text_col in ['Description', 'Title', 'Genres', 'Directors']:
    movies_df[text_col] = movies_df[text_col].str.strip()

# Verify the changes. info() prints and returns None, so it is kept out of
# the final expression; head() is left last to get the rich table display.
movies_df.info()
movies_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   id            1000 non-null   int64         
 1   Description   1000 non-null   object        
 2   Title         1000 non-null   object        
 3   rating        1000 non-null   float64       
 4   runtime-mins  1000 non-null   int64         
 5   Year          1000 non-null   int64         
 6   Genres        1000 non-null   object        
 7   release-date  1000 non-null   datetime64[ns]
 8   Directors     1000 non-null   object        
dtypes: datetime64[ns](1), float64(1), int64(3), object(4)
memory usage: 70.4+ KB


(None,
    id                               Description  \
 0   1  Worldwide Lifetime Gross: $2,923,706,026   
 1   2  Worldwide Lifetime Gross: $2,799,439,100   
 2   3  Worldwide Lifetime Gross: $2,320,250,281   
 3   4  Worldwide Lifetime Gross: $2,264,750,694   
 4   5  Worldwide Lifetime Gross: $2,071,310,218   
 
                                         Title  rating  runtime-mins  Year  \
 0                                      Avatar     7.9           162  2009   
 1                           Avengers: Endgame     8.4           181  2019   
 2                    Avatar: The Way of Water     7.5           192  2022   
 3                                     Titanic     7.9           194  1997   
 4  Star Wars: Episode VII - The Force Awakens     7.8           138  2015   
 
                                Genres release-date                 Directors  
 0  Action, Adventure, Fantasy, Sci-Fi   2009-12-18             James Cameron  
 1    Action, Adventure, Drama, Sci-Fi   2019-04-

In [16]:
# Peek at the top five rows of the frame.
movies_df.head(n=5)

Unnamed: 0,id,Description,Title,rating,runtime-mins,Year,Genres,release-date,Directors
0,1,"Worldwide Lifetime Gross: $2,923,706,026",Avatar,7.9,162,2009,"Action, Adventure, Fantasy, Sci-Fi",2009-12-18,James Cameron
1,2,"Worldwide Lifetime Gross: $2,799,439,100",Avengers: Endgame,8.4,181,2019,"Action, Adventure, Drama, Sci-Fi",2019-04-26,"Anthony Russo, Joe Russo"
2,3,"Worldwide Lifetime Gross: $2,320,250,281",Avatar: The Way of Water,7.5,192,2022,"Action, Adventure, Fantasy, Sci-Fi",2022-12-16,James Cameron
3,4,"Worldwide Lifetime Gross: $2,264,750,694",Titanic,7.9,194,1997,"Drama, Romance",1997-12-19,James Cameron
4,5,"Worldwide Lifetime Gross: $2,071,310,218",Star Wars: Episode VII - The Force Awakens,7.8,138,2015,"Action, Adventure, Sci-Fi",2015-12-18,J.J. Abrams


In [17]:
# Build the gross column from 'Description': delete every non-digit character
# from the text and parse the remaining digits as one integer.
# NOTE: despite the column name, the values stored here are the full amounts
# (e.g. "$2,923,706,026" -> 2923706026); they are rescaled in a later cell.
movies_df['Gross_Millions'] = movies_df['Description'].map(
    lambda description_text: int(re.sub(r'[^\d]', '', description_text))
)

# The text column is no longer needed once the number has been extracted.
movies_df = movies_df.drop(columns=['Description'])

In [18]:
# Convert the gross to whole millions.
# The scaling is done numerically on the column; there is no need to format
# each value as a "2,924M" string and then parse it back to an integer.
# Series.round() rounds halves to even, the same rule as the built-in round(),
# so the resulting integers are unchanged.
# NOTE: this cell rescales whatever is currently in the column, so running it
# a second time divides by 1e6 again -- run it once after the extraction cell.
movies_df['Gross_Millions'] = (movies_df['Gross_Millions'] / 1e6).round().astype(int)

# Display the first few rows to verify the changes
movies_df.head()

Unnamed: 0,id,Title,rating,runtime-mins,Year,Genres,release-date,Directors,Gross_Millions
0,1,Avatar,7.9,162,2009,"Action, Adventure, Fantasy, Sci-Fi",2009-12-18,James Cameron,2924
1,2,Avengers: Endgame,8.4,181,2019,"Action, Adventure, Drama, Sci-Fi",2019-04-26,"Anthony Russo, Joe Russo",2799
2,3,Avatar: The Way of Water,7.5,192,2022,"Action, Adventure, Fantasy, Sci-Fi",2022-12-16,James Cameron,2320
3,4,Titanic,7.9,194,1997,"Drama, Romance",1997-12-19,James Cameron,2265
4,5,Star Wars: Episode VII - The Force Awakens,7.8,138,2015,"Action, Adventure, Sci-Fi",2015-12-18,J.J. Abrams,2071


In [20]:
# Save the cleaned data. index=False keeps the row index out of the file;
# writing it would add an unnamed first column that comes back as
# "Unnamed: 0" when the CSV is read again.
movies_df.to_csv('movies_updated.csv', index=False)