In [2]:
import requests
import pandas as pd
import time

In [None]:
# Define the headers with Bearer token for authorization
headers = {
    "accept": "application/json",
    "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiJjMjNlZGZjN2I5YmZjMTU5ZmQ1Y2I5MmQ5YjU1NDYzMCIsIm5iZiI6MTcyOTk0ODE1Ni4yODI5NjMsInN1YiI6IjY3MDExYjAxYjE0NjI4MmY3Yjg1NTMwMyIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.5Y0PjdKrEff1LvIjVVXEki5leFFa6H1nE5s0opipDMM"
}

def fetch_indian_movies_with_imdb_ids(start_year, end_year):
    movies_data = []

    for year in range(start_year, end_year + 1):
        page = 1
        while True:
            # Construct the URL with specified parameters for each year and page
            url = (f"https://api.themoviedb.org/3/discover/movie?include_adult=false&include_video=false"
                   f"&language=en-US&region=IN&sort_by=popularity.desc&with_origin_country=IN"
                   f"&year={year}&page={page}")
            response = requests.get(url, headers=headers).json()

            # Check for any errors in the response
            if response.get("status_code"):
                print(f"Error {response['status_code']}: {response.get('status_message')}")
                break

            # Check if results are present
            if 'results' not in response or len(response['results']) == 0:
                break

            for movie in response['results']:
                tmdb_id = movie['id']

                # Fetch IMDb ID and detailed movie information
                imdb_url = f"https://api.themoviedb.org/3/movie/{tmdb_id}"
                imdb_response = requests.get(imdb_url, headers=headers).json()

                # Check for any errors in the movie details response
                if imdb_response.get("status_code"):
                    print(f"Error fetching movie details: {imdb_response.get('status_message')}")
                    continue

                # Fetch cast and crew information
                credits_url = f"https://api.themoviedb.org/3/movie/{tmdb_id}/credits"
                credits_response = requests.get(credits_url, headers=headers).json()

                # Extract main cast (first 5 cast members) and directors from crew
                main_cast = [member['name'] for member in credits_response.get('cast', [])[:5]]
                directors = [member['name'] for member in credits_response.get('crew', []) if member['job'] == 'Director']

                # Fetch keywords
                keywords_url = f"https://api.themoviedb.org/3/movie/{tmdb_id}/keywords"
                keywords_response = requests.get(keywords_url, headers=headers).json()
                keywords = [keyword['name'] for keyword in keywords_response.get('keywords', [])]

                # Append all movie details to the list if IMDb ID is available
                if 'imdb_id' in imdb_response:
                    movies_data.append({
                        'title': imdb_response.get('title'),
                        'imdb_id': imdb_response.get('imdb_id', None),
                        'release_date': imdb_response.get('release_date', None),
                        # 'runtime': imdb_response.get('runtime'),
                        'genres': ", ".join([genre['name'] for genre in imdb_response.get('genres', [])]) or None,
                        'overview': imdb_response.get('overview', None),
                        # 'vote_average': imdb_response.get('vote_average'),
                        # 'vote_count': imdb_response.get('vote_count'),
                        'popularity': imdb_response.get('popularity', None),
                        # 'budget': imdb_response.get('budget'),
                        'revenue': imdb_response.get('revenue', None),
                        'main_cast': ", ".join(main_cast) if main_cast else None,
                        'directors': ", ".join(directors) if directors else None,
                          'keywords': ", ".join(keywords)  if keywords else None,
                    })

            # Stop if no more pages are available
            if page >= response.get('total_pages', 1):
                break
            page += 1

            # To avoid hitting the API rate limit
            time.sleep(0.25)

    # Create DataFrame from the collected movie data
    df = pd.DataFrame(movies_data)
    return df

# Fetch Indian movies from 1990 to 2024 and display as DataFrame
indian_movies_df = fetch_indian_movies_with_imdb_ids(1960, 2024)


In [None]:
indian_movies_df.to_csv('/content/drive/MyDrive/Project/indian_movies.csv', index=False)

In [37]:
    indian_movies_df = pd.read_csv(
        '/content/drive/MyDrive/Project/indian_movies.csv',
        encoding='utf-8',  # Explicitly set encoding to UTF-8
        on_bad_lines='skip',  # Skip lines with parsing errors
        engine='python'  # Use the Python engine for more flexibility
    )

In [38]:
indian_movies_df.isnull().sum()

Unnamed: 0,0
title,0
imdb_id,3604
release_date,25
genres,4062
overview,1188
popularity,13
revenue,14
main_cast,2001
directors,1566
keywords,18763


In [39]:
indian_movies_df=indian_movies_df.dropna(subset=['imdb_id'])

In [40]:
indian_movies_df.isnull().sum()

Unnamed: 0,0
title,0
imdb_id,0
release_date,14
genres,3018
overview,845
popularity,9
revenue,10
main_cast,1249
directors,1253
keywords,15691


In [41]:
# OMDb API key
omdb_api_key = 'd14d1eaf'  # Replace with your actual OMDb API key

def fetch_from_omdb(imdb_id):
    """Fetch movie details from OMDb API."""
    omdb_url = f"http://www.omdbapi.com/?i={imdb_id}&apikey={omdb_api_key}"
    response = requests.get(omdb_url).json()

    # Return None if the movie is not found
    if response.get("Response") == "False":
        return None

    return {
        'imdb_id': response.get('imdbID', None),
        'release_date': response.get('Released', None),
        'genres': response.get('Genre', None),
        'overview': response.get('Plot', None),
    }


In [42]:
def update_missing_data(indian_movies_df):
    """Update missing movie details in the DataFrame by fetching from OMDb."""
    for index, row in indian_movies_df.iterrows():
        # Check if any of the necessary fields are None
        if pd.isna(row['release_date']) or pd.isna(row['genres']) or pd.isna(row['overview']):
            if row['imdb_id']:
                omdb_data = fetch_from_omdb(row['imdb_id'])

                # If data is returned from OMDb, update the DataFrame
                if omdb_data:
                    if pd.isna(row['release_date']):
                        indian_movies_df.at[index, 'release_date'] = omdb_data['release_date']
                    if pd.isna(row['genres']):
                        indian_movies_df.at[index, 'genres'] = omdb_data['genres']
                    if pd.isna(row['overview']):
                        indian_movies_df.at[index, 'overview'] = omdb_data['overview']

    return indian_movies_df



In [36]:
# Example usage
# Assuming you have the indian_movies_df DataFrame ready
# indian_movies_df = pd.DataFrame([...])  # Your existing DataFrame
indian_movies_df = update_missing_data(indian_movies_df)

KeyboardInterrupt: 

In [43]:
indian_movies_df.isnull().sum()

Unnamed: 0,0
title,0
imdb_id,0
release_date,14
genres,3018
overview,845
popularity,9
revenue,10
main_cast,1249
directors,1253
keywords,15691


In [44]:
indian_movies_df.sample(5)

Unnamed: 0,title,imdb_id,release_date,genres,overview,popularity,revenue,main_cast,directors,keywords
8288,Qaidi,tt2009559,2002-05-03,Action,Ravi Verma is an efficient and sincere C.B.I o...,0.942,0.0,"Archana, Mithun Chakraborty, Raza Murad, Jhony...",T. L. V. Prasad,
10855,Josh,tt12958872,2009-04-10,"Comedy, Drama","Rockey, the son of a middle-class couple, fall...",0.389,0.0,"Rakesh Adiga, Sharan, Nithya Menen, Poorna",Shivamani,"josh, rakesh adiga, shivamani"
13487,Alasyam Amrutham,tt1795548,2010-12-03,,From Chandra Mahesh,0.048,0.0,"Nikhil Siddhartha, Arvind Krishna, Madalasa Sh...",Chandra Mahesh,
23297,Sucha Soorma,tt11090316,2024-09-20,"History, Thriller",An uncompromising folklore figure of the early...,5.404,0.0,"Babbu Mann, Suvinder Vicky, Mahabir Bhullar, J...",Amitoj Mann,historical drama
19931,Palthu Janwar,tt21238288,2022-09-02,"Drama, Comedy",Palthu Janwar is a refreshing take on the rela...,13.304,0.0,"Basil Joseph, Dileesh Pothan, Indrans, Johny A...",Sangeeth P. Rajan,


In [45]:
indian_movies_df=indian_movies_df[['imdb_id','title','genres','overview','directors','main_cast','keywords']]

In [46]:
indian_movies_df.rename(columns = {'imdb_id':'movie_id','title':'movie_name', 'genres':'genre','directors':'director', 'main_cast':'cast' }, inplace = True)

In [47]:
indian_movies_df.head(2)

Unnamed: 0,movie_id,movie_name,genre,overview,director,cast,keywords
0,tt0048473,Pather Panchali,Drama,"Impoverished priest Harihar Ray, dreaming of a...",Satyajit Ray,"Subir Banerjee, Uma Das Gupta, Karuna Banerjee...","robbery, misery, difficult childhood, move, mo..."
1,tt0052572,Apur Sansar,Drama,Apu is a jobless ex-student dreaming vaguely o...,Satyajit Ray,"Soumitra Chatterjee, Sharmila Tagore, Alok Cha...","dying and death, broken engagement, arranged m..."


In [23]:
df=pd.read_csv('/content/drive/MyDrive/Project/final_movie_dataset_with_keyword.csv', index_col=0)

In [48]:
df=df.drop(['year','movie_id2'], axis=1)

In [49]:
df

Unnamed: 0,movie_id,movie_name,genre,overview,director,cast,Keywords
0,tt15354916,Jawan,"Action, Thriller",A high-octane action thriller which outlines t...,Atlee,"Shah Rukh Khan, Nayanthara, Vijay Sethupathi, ...","revenge, chase, suicide, farmer, train, injust..."
1,tt15748830,Jaane Jaan,"Crime, Drama, Mystery",A single mother and her daughter who commit a ...,Sujoy Ghosh,"Kareena Kapoor, Jaideep Ahlawat, Vijay Varma, ...","single mother, police, investigation, criminal..."
2,tt11663228,Jailer,"Action, Comedy, Crime",A retired jailer goes on a manhunt to find his...,Nelson Dilipkumar,"Rajinikanth, Mohanlal, Shivarajkumar, Jackie S...","prison, ex cop, intermission, action hero, one..."
3,tt14993250,Rocky Aur Rani Kii Prem Kahaani,"Comedy, Drama, Family",Flamboyant Punjabi Rocky and intellectual Beng...,Karan Johar,"Ranveer Singh, Alia Bhatt, Dharmendra, Shabana...","love, family relationships, rom com, news repo..."
4,tt15732324,OMG 2,"Comedy, Drama",An unhappy civilian asks the court to mandate ...,Amit Rai,"Pankaj Tripathi, Akshay Kumar, Yami Gautam, Pa...","courtroom, sex education, court room drama, re..."
...,...,...,...,...,...,...,...
4026,tt33808098,Cycle Mahesh,unknown,A young worker's epic bicycle journey home and...,Suhel Banerjee,unknown,
4027,tt33808618,Maryade Prashne,unknown,Unknown,Nagaraja Somayaji,"Poornachandra Mysuru, Rekha Kudligi, Teju Bela...",
4028,tt33810459,Uruttu Factory,Documentary,Unknown,"Sid, Sid",Sid,
4029,tt33812085,Satchi Perumal (Witness Perumal),Drama,'Satchi Perumal' with the main lead Perumal fr...,RP. Vinnu,"Pandiamma, Verra Paramasivam, V P. Rajasekar, ...",


In [54]:
indian_movies_df

Unnamed: 0,movie_id,movie_name,genre,overview,director,cast,keywords
0,tt0048473,Pather Panchali,Drama,"Impoverished priest Harihar Ray, dreaming of a...",Satyajit Ray,"Subir Banerjee, Uma Das Gupta, Karuna Banerjee...","robbery, misery, difficult childhood, move, mo..."
1,tt0052572,Apur Sansar,Drama,Apu is a jobless ex-student dreaming vaguely o...,Satyajit Ray,"Soumitra Chatterjee, Sharmila Tagore, Alok Cha...","dying and death, broken engagement, arranged m..."
2,tt0398974,Dr. Shaitan,"Science Fiction, Horror",The power mad Dr. Shaitan (Sheikh Mukhtar) use...,Shreeram Bohra,"Sheikh Mukhtar, Premnath Malhotra, Shakila, He...",
3,tt0054302,Shriman Satyawadi,"Action, Crime, Drama",An honest man instills in his only child the a...,S.M. Abbas,"Raj Kapoor, Shakila, Mehmood, Nazir Hussain, R...",
4,tt0053765,Devi,Drama,A devout upper-class Hindu has a vision in a d...,Satyajit Ray,"Sharmila Tagore, Soumitra Chatterjee, Chhabi B...",preserved film
...,...,...,...,...,...,...,...
24295,tt31495418,India’s 1st Best Trans Model Agency,Documentary,Rudrani Chettri has set up a model agency for ...,Ila Mehrotra Jenkins,,
24296,tt26733322,Bheema,Action,"Bheema, a gangster, wages a war against Dragon...",Duniya Vijay,"Duniya Vijay, Priya Shatamarshan, Dragon Manju...",
24297,tt16420454,Oneness: The Movie,,"In 2013, Ivan Martin, an 18-year-old boy, was ...",Priyakanta Laishram,"Priyakanta Laishram, Maya Chowdhry, Suraj Ngas...",
24302,tt33513689,Skyward,,"Set against a surreal landscape, this is a tal...",Suruchi Sharma,,


In [52]:
new_dataframe = pd.concat([df, indian_movies_df]).drop_duplicates(subset='movie_id')

In [53]:
new_dataframe.shape

(20834, 8)

In [58]:
df[df['movie_id']=='tt0053765']

Unnamed: 0,movie_id,movie_name,genre,overview,director,cast,Keywords


In [60]:
new_dataframe.to_csv('/content/drive/MyDrive/Project/final_dataset_with_20000_movies.csv')