In [2]:
import pandas as pd

# Load the complete movies-with-mentions table
df = pd.read_csv('movies_with_mentions.csv')

# Keep only the identifier and title columns, then persist that narrower
# view (without the index) for the later cells to read
wanted_columns = ['movieId', 'movieName']
df_extracted = df.loc[:, wanted_columns]
df_extracted.to_csv('extracted.csv', index=False)


In [6]:
# Sanity check: (row count, column count) of whatever frame is currently bound
# to `df`. NOTE(review): several cells rebind `df`, so the result depends on
# which of them ran last.
df.shape


(6924, 2)

In [16]:
import pandas as pd
import requests
import time

# Read the title/year table. 'movies_with_year.csv' is written by the
# year-extraction cell of this notebook, so that cell has to run first.
df = pd.read_csv('movies_with_year.csv')

# OMDb API key. The OMDB_API_KEY environment variable takes precedence so the
# credential does not have to live in the notebook; the literal is kept only
# as a fallback so existing runs keep working.
# NOTE(review): the fallback key is committed in plain text — consider
# rotating it and removing the literal.
import os

api_key = os.environ.get('OMDB_API_KEY', '61b12258')

# Function to fetch movie details
def fetch_movie_details(movie_names, timeout=10):
    """Look up the cast and director of each title through the OMDb API.

    Every title is sent as the ``t`` query parameter together with the
    module-level ``api_key``. Only the title is sent (no year), so films that
    share a title cannot be told apart by this lookup.

    Args:
        movie_names: Iterable of movie titles.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A tuple ``(actors_list, directors_list)``. Both lists have one entry
        per input title, in input order. An entry is the ``Actors`` /
        ``Director`` string of the response, or ``''`` when the title is
        missing (None/NaN), the request or JSON decoding fails, or the
        response does not carry ``Response == 'True'``.
    """
    actors_list = []
    directors_list = []

    for movie in movie_names:
        actors = ''
        directors = ''

        # None / NaN titles (missing cells) have nothing to look up; sending
        # them would query the literal text "None" or "nan".
        is_missing = movie is None or (isinstance(movie, float) and movie != movie)

        if not is_missing:
            data = {}
            try:
                # `params` makes requests URL-encode the title, so characters
                # such as '&', '#' or '+' cannot corrupt the query string.
                # https keeps the API key out of plain-text traffic.
                response = requests.get(
                    "https://www.omdbapi.com/",
                    params={'t': movie, 'apikey': api_key},
                    timeout=timeout,
                )
                data = response.json()
            except (requests.RequestException, ValueError) as exc:
                # One failed lookup must not discard every result gathered so
                # far. Only the exception type is printed because the
                # exception text can contain the request URL with the key.
                print(f"OMDb lookup failed for {movie!r}: {type(exc).__name__}")

            # .get() tolerates payloads that lack the 'Response' field.
            if isinstance(data, dict) and data.get('Response') == 'True':
                actors = data.get('Actors', '')
                directors = data.get('Director', '')

        actors_list.append(actors)
        directors_list.append(directors)

    return actors_list, directors_list

# Walk the title column in fixed-size slices, calling fetch_movie_details once per slice
batch_size = 10  # Adjust this based on your API limit and number of movies
actors_collected = []
directors_collected = []

all_titles = df['movieName'].tolist()

for start_idx in range(0, len(df), batch_size):
    stop_idx = start_idx + batch_size
    batch_movies = all_titles[start_idx:stop_idx]
    actors_batch, directors_batch = fetch_movie_details(batch_movies)
    actors_collected += actors_batch
    directors_collected += directors_batch

    # Pause between slices to respect API rate limits
    time.sleep(1)  # Adjust delay as needed

# Attach the gathered strings as two new columns, row-aligned with the titles
df['Actors'] = actors_collected
df['Directors'] = directors_collected

# Persist the enriched table (index omitted)
df.to_csv('movies_with_cast_and_crew.csv', index=False)


In [11]:
import pandas as pd
import re

# Read the (movieId, movieName) extract written by the column-extraction cell
df = pd.read_csv('extracted.csv')

# The whole file is processed; no row limit is applied here.

# Function to extract year from movie name and remove it from the name
def extract_year(movie_name):
    """Split a title such as ``'Headhunter (2009)'`` into name and year.

    The year is the last parenthesised four-digit group in the title. Only
    that occurrence is removed, and the text on either side of it is joined
    with a single space so neighbouring words are not glued together.

    Args:
        movie_name: The raw title. Non-string values (e.g. the NaN pandas
            produces for an empty cell) are passed through untouched.

    Returns:
        A ``(name, year)`` tuple. ``year`` is the four digits as a string, or
        ``''`` when no parenthesised year is present; in that case ``name``
        is the input unchanged.
    """
    # re.search/finditer raise TypeError on NaN/None, so bail out early.
    if not isinstance(movie_name, str):
        return movie_name, ''

    matches = list(re.finditer(r'\((\d{4})\)', movie_name))
    if not matches:
        return movie_name, ''

    # Use the last match so a title that itself contains "(dddd)" keeps that
    # part and only the trailing year is stripped.
    match = matches[-1]
    before = movie_name[:match.start()].rstrip()
    after = movie_name[match.end():].lstrip()
    cleaned = ' '.join(part for part in (before, after) if part)
    return cleaned, match.group(1)

# Run extract_year over every title, then unpack the resulting
# (name, year) pairs into the title column and a new Year column
name_year_pairs = df['movieName'].apply(extract_year)
clean_names, years = zip(*name_year_pairs)
df['movieName'] = clean_names
df['Year'] = years

# Write the title/year table to disk (index omitted)
df.to_csv('movies_with_year.csv', index=False)

print(df)


      movieId               movieName  Year
0       75796              Headhunter  2009
1       75815  Angels in the Outfield  1994
2       75822  Eddie and the Cruisers  1983
3       75828          Ninja Assassin  2009
4       75867                 Orgazmo  1997
...       ...                     ...   ...
6919   206079     2036 Origin Unknown  2018
6920   206080            The Hatching  2016
6921   206085           Hotel Artemis  2018
6922   206087              Hereditary  2018
6923   206092                  Mowgli  2019

[6924 rows x 3 columns]


In [17]:
import pandas as pd

# Read the enriched table ('Actors' and 'Directors' columns added) that the
# cast-and-crew lookup cell wrote to 'movies_with_cast_and_crew.csv'
df = pd.read_csv('movies_with_cast_and_crew.csv')

# Function to split and clean actor and director names
def split_and_clean(names):
    """Turn a comma-separated string of names into a list of trimmed names.

    Args:
        names: A string such as ``'Tom Hanks, Meg Ryan'``, or a missing value
            (None/NaN).

    Returns:
        The individual names with surrounding whitespace removed. Fragments
        that are empty after trimming (from ``'A,,B'``, a trailing comma, or a
        blank string) are dropped. A missing value yields ``[]``.
    """
    if pd.isna(names):
        return []
    trimmed = (name.strip() for name in names.split(','))
    # Skip empty fragments so '' never shows up as a person's name.
    return [name for name in trimmed if name]

def _unique_names(column):
    """Return the distinct names found in a column of comma-separated strings."""
    present = column.dropna()
    per_row_lists = present.apply(split_and_clean)
    # explode() gives every list element its own row before de-duplicating
    return per_row_lists.explode().unique()


# Distinct people appearing in each credit column
actor_names = _unique_names(df['Actors'])
director_names = _unique_names(df['Directors'])

# Wrap each array in a one-column frame so it can be written as CSV
actors_df = pd.DataFrame(actor_names, columns=['ActorName'])
directors_df = pd.DataFrame(director_names, columns=['DirectorName'])

# Write both lists to disk (index omitted)
actors_df.to_csv('unique_actors.csv', index=False)
directors_df.to_csv('unique_directors.csv', index=False)

print("Unique actors and directors have been saved to 'unique_actors.csv' and 'unique_directors.csv'.")


Unique actors and directors have been saved to 'unique_actors.csv' and 'unique_directors.csv'.
