import requests
import pandas as pd
import time

# API details
import os

# NOTE(security): an API key is committed in source here. The literal is kept
# only as a fallback so the script keeps working unchanged; prefer exporting
# TMDB_API_KEY in the environment and rotating/removing the literal below.
api_key = os.environ.get('TMDB_API_KEY') or '8265bd1679663a7ea12ac168da84d2e8'
base_url = 'https://api.themoviedb.org/3/movie/top_rated'
language = 'en-US'
total_pages = 471  # number of result pages the fetch loop iterates over
max_retries = 5  # attempts per page before that page is skipped

# List to hold movie data
movies = []

# Function to fetch data from API with retry mechanism
def fetch_data_with_retries(url, params, max_retries=5, timeout=10):
    """GET *url* and return the decoded JSON body, retrying on request errors.

    Args:
        url: Endpoint to request.
        params: Query-string parameters passed to ``requests.get``.
        max_retries: Maximum number of attempts before giving up.
        timeout: Seconds to wait for the server on each attempt, so a stalled
            connection cannot block the script forever.

    Returns:
        The parsed JSON response of the first successful attempt.

    Raises:
        Exception: "Max retries exceeded" once every attempt has failed; the
            last request error is attached as ``__cause__``.
    """
    backoff_time = 1  # initial backoff time in seconds
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, params=params, timeout=timeout)
            response.raise_for_status()
            return response.json()
        # ConnectionError is a RequestException subclass, so one clause covers both.
        except requests.exceptions.RequestException as e:
            last_error = e
            print(f"Error: {e}, Retrying... ({attempt}/{max_retries})")
            # Only wait when another attempt will actually follow.
            if attempt < max_retries:
                time.sleep(backoff_time)
                backoff_time *= 2  # exponential backoff
    raise Exception("Max retries exceeded") from last_error

# Walk through every result page and collect the fields we need
for page in range(1, total_pages + 1):
    print(f"Fetching page {page}/{total_pages}")
    params = {'api_key': api_key, 'language': language, 'page': page}
    try:
        data = fetch_data_with_retries(base_url, params, max_retries)
    except Exception as e:
        # A page that still fails after all retries is reported and skipped
        print(f"Failed to fetch data for page {page}: {e}")
        continue

    # Keep only title, overview (as 'description') and genre IDs per movie
    movies.extend(
        {
            'title': entry['title'],
            'description': entry['overview'],
            'genres': entry['genre_ids'],
        }
        for entry in data['results']
    )

# Build the DataFrame from the collected records
df = pd.DataFrame(movies)

df

import os

# NOTE(security): an API key is committed in source here. The literal is kept
# only as a fallback so this cell keeps working unchanged; prefer exporting
# TMDB_API_KEY in the environment and rotating/removing the literal below.
api_key = os.environ.get('TMDB_API_KEY') or '8265bd1679663a7ea12ac168da84d2e8'
genre_url = 'https://api.themoviedb.org/3/genre/movie/list'
language = 'en-US'

# Fetch genre data
def fetch_genres(api_key, genre_url, language, timeout=10):
    """Fetch the genre list and return it as an ``{id: name}`` dictionary.

    Args:
        api_key: API key sent as the ``api_key`` query parameter.
        genre_url: Endpoint that returns a JSON body with a ``genres`` list.
        language: Value of the ``language`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict mapping each genre's ``id`` to its ``name``.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status, instead of failing later with a KeyError on ``'genres'``.
    """
    params = {'api_key': api_key, 'language': language}
    response = requests.get(genre_url, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()
    return {genre['id']: genre['name'] for genre in data['genres']}

# Build the genre-ID -> genre-name lookup
genre_mapping = fetch_genres(api_key=api_key, genre_url=genre_url, language=language)

# Function to replace genre IDs with names
def replace_genre_ids(genres, mapping=None):
    """Translate a collection of genre IDs into genre names.

    Args:
        genres: Either an iterable of genre IDs, or the string form of a
            Python list of IDs (e.g. ``"[18, 80]"``).
        mapping: Optional ``{id: name}`` dict. When omitted, the module-level
            ``genre_mapping`` is used.

    Returns:
        A list of genre names; IDs missing from the mapping become
        ``'Unknown'``.
    """
    import ast

    if mapping is None:
        mapping = genre_mapping
    # Only strings need parsing; literal_eval accepts literals only, unlike
    # eval, which would execute arbitrary code and rejects list input outright.
    if isinstance(genres, str):
        genres = ast.literal_eval(genres)
    return [mapping.get(genre_id, 'Unknown') for genre_id in genres]

# Replace genre IDs with names in the DataFrame (one replace_genre_ids call per row)
df['genres'] = df['genres'].apply(replace_genre_ids)

df

# Save the updated DataFrame to CSV; index=False keeps the row index out of the file
df.to_csv('movies_dataset_with_genres.csv', index=False)

# Show the first five rows
df.head(5)

# Now let's move to text pre-processing

#### First, lowercase the description

In [1]:
import pandas as pd 

In [2]:
# Load the dataset from the same CSV file name the earlier to_csv call wrote to
df = pd.read_csv('movies_dataset_with_genres.csv')

In [3]:
df.head()

Unnamed: 0,title,description,genres
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"['Drama', 'Crime']"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","['Drama', 'Crime']"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"['Drama', 'Crime']"
3,Schindler's List,The true story of how businessman Oskar Schind...,"['Drama', 'History', 'War']"
4,12 Angry Men,The defense and the prosecution have rested an...,['Drama']


In [4]:
# Lowercase every description through the pandas .str accessor
df['description'] = df['description'].str.lower()

In [5]:
df.head()

Unnamed: 0,title,description,genres
0,The Shawshank Redemption,imprisoned in the 1940s for the double murder ...,"['Drama', 'Crime']"
1,The Godfather,"spanning the years 1945 to 1955, a chronicle o...","['Drama', 'Crime']"
2,The Godfather Part II,in the continuing saga of the corleone crime f...,"['Drama', 'Crime']"
3,Schindler's List,the true story of how businessman oskar schind...,"['Drama', 'History', 'War']"
4,12 Angry Men,the defense and the prosecution have rested an...,['Drama']


 #### Now let's remove punctuation

In [6]:
import string

In [7]:
# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return *text* with all ASCII punctuation characters removed.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for example
            ``None`` or a float NaN) is treated as missing.

    Returns:
        The cleaned string, or ``''`` when *text* is not a string.
    """
    if not isinstance(text, str):
        return ''
    return text.translate(_PUNCTUATION_TABLE)

In [8]:
# Remove punctuation from the 'description' column; non-string entries become ''
df['description'] = df['description'].apply(remove_punctuation)

In [9]:
df.head()

Unnamed: 0,title,description,genres
0,The Shawshank Redemption,imprisoned in the 1940s for the double murder ...,"['Drama', 'Crime']"
1,The Godfather,spanning the years 1945 to 1955 a chronicle of...,"['Drama', 'Crime']"
2,The Godfather Part II,in the continuing saga of the corleone crime f...,"['Drama', 'Crime']"
3,Schindler's List,the true story of how businessman oskar schind...,"['Drama', 'History', 'War']"
4,12 Angry Men,the defense and the prosecution have rested an...,['Drama']


#### Let's do spell checking

from textblob import TextBlob

def correct_spelling(text):
    """Return *text* after TextBlob's spelling correction, as a plain ``str``."""
    return str(TextBlob(text).correct())


def spelling_check(text):
    """Alias of ``correct_spelling``, kept so existing callers keep working.

    The two functions previously had identical bodies; this one now delegates
    so the logic lives in a single place.
    """
    return correct_spelling(text)

# Run the TextBlob spelling correction over every description (one call per row)
df['description'] = df['description'].apply(correct_spelling)

df.head()

#### Now let's move to the word tokenization step

In [14]:
import spacy

In [15]:
# Load the 'en_core_web_sm' spaCy pipeline; the tokenization and lemmatization helpers below call it
nlp=spacy.load('en_core_web_sm')

In [19]:
def word_tokenization(text):
    """Return a generator over the text of each spaCy token in *text*.

    The pipeline runs as soon as the function is called; only the extraction
    of the token strings is lazy.
    """
    return (tok.text for tok in nlp(text))

In [24]:
# Apply word tokenization to the 'description' column.
# word_tokenization returns a generator, so each result is materialised with list().
df['description_tokens'] = df['description'].apply(lambda x: list(word_tokenization(x)))

In [25]:
df.head()

Unnamed: 0,title,description,genres,description_tokens
0,The Shawshank Redemption,imprisoned in the 1940s for the double murder ...,"['Drama', 'Crime']","[imprisoned, in, the, 1940s, for, the, double,..."
1,The Godfather,spanning the years 1945 to 1955 a chronicle of...,"['Drama', 'Crime']","[spanning, the, years, 1945, to, 1955, a, chro..."
2,The Godfather Part II,in the continuing saga of the corleone crime f...,"['Drama', 'Crime']","[in, the, continuing, saga, of, the, corleone,..."
3,Schindler's List,the true story of how businessman oskar schind...,"['Drama', 'History', 'War']","[the, true, story, of, how, businessman, oskar..."
4,12 Angry Men,the defense and the prosecution have rested an...,['Drama'],"[the, defense, and, the, prosecution, have, re..."


#### Now let's do lemmatization

In [28]:
# Function to lemmatize text
def lemmatize_text(text):
    """Return the lemma of every spaCy token in *text* as a list of strings."""
    return [tok.lemma_ for tok in nlp(text)]

In [30]:
# Apply lemmatization to the 'description' column.
# The lemmas of each row are joined back into one space-separated string.
df['description_lemmatized'] = df['description'].apply(lambda x: ' '.join(lemmatize_text(x)))

In [31]:
df.head()

Unnamed: 0,title,description,genres,description_tokens,description_lemmatized
0,The Shawshank Redemption,imprisoned in the 1940s for the double murder ...,"['Drama', 'Crime']","[imprisoned, in, the, 1940s, for, the, double,...",imprison in the 1940 for the double murder of ...
1,The Godfather,spanning the years 1945 to 1955 a chronicle of...,"['Drama', 'Crime']","[spanning, the, years, 1945, to, 1955, a, chro...",span the year 1945 to 1955 a chronicle of the ...
2,The Godfather Part II,in the continuing saga of the corleone crime f...,"['Drama', 'Crime']","[in, the, continuing, saga, of, the, corleone,...",in the continue saga of the corleone crime fam...
3,Schindler's List,the true story of how businessman oskar schind...,"['Drama', 'History', 'War']","[the, true, story, of, how, businessman, oskar...",the true story of how businessman oskar schind...
4,12 Angry Men,the defense and the prosecution have rested an...,['Drama'],"[the, defense, and, the, prosecution, have, re...",the defense and the prosecution have rest and ...


In [32]:

# Write the processed DataFrame back to the same CSV file it was loaded from; index=False omits the row index
df.to_csv('movies_dataset_with_genres.csv', index=False)