In [1]:
"""Codigo completa el proceso de ETL luego del EDA"""

'Codigo completa el proceso de ETL luego del EDA'

In [2]:
import pandas as pd
import numpy as np
import os
import nltk

In [3]:
# Base directory that holds the dataset files. Defaults to the original local
# path; set the P1_HENRY_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
RUTA_BASE = os.environ.get("P1_HENRY_DATA_DIR", "C:/Users/oscar/Desktop/P1-HENRY/data_set")
# Directory where the NLTK resources are downloaded (inside the virtual
# environment by default); override it with P1_HENRY_NLTK_DIR.
NLTK_PATH = os.environ.get("P1_HENRY_NLTK_DIR", "C:/Users/oscar/Desktop/P1-HENRY/env/nltk_data")
# Register the directory with NLTK only once, so re-running this cell does not
# keep appending duplicate entries to nltk.data.path.
if NLTK_PATH not in nltk.data.path:
    nltk.data.path.append(NLTK_PATH)


In [4]:
# Read the final dataset (Parquet file) from the base data directory
ruta_final_parquet = os.path.join(RUTA_BASE, "movies_final.parquet")
movies_df = pd.read_parquet(path=ruta_final_parquet)

In [5]:
# Keep only the movies released in 1975 or later
desde_1975 = movies_df['release_year'].ge(1975)
movies_df = movies_df.loc[desde_1975]


In [6]:
# Data type conversion: coerce the numeric columns in one pass
# (values that cannot be parsed become NaN)
columns_to_numeric = ['popularity', 'vote_count', 'budget', 'revenue']
movies_df[columns_to_numeric] = movies_df[columns_to_numeric].apply(
    pd.to_numeric, errors='coerce'
)

# Missing budget / revenue values are stored as 0
movies_df[['budget', 'revenue']] = movies_df[['budget', 'revenue']].fillna(0)
# vote_average is kept as float and release_year as int
movies_df = movies_df.astype({'vote_average': float, 'release_year': int})

In [7]:
# Keep only the movies whose original language is one of the main languages
idiomas_principales = ['en', 'es', 'fr', 'it', 'de']
en_idioma_principal = movies_df['original_language'].isin(idiomas_principales)
movies_df = movies_df.loc[en_idioma_principal]

In [8]:
# Drop the columns that are not needed from here on; names that are not
# present in the frame are skipped (errors='ignore')
columns_to_drop = [
    'return',
    'budget',
    'log_revenue',
    'log_budget',
    'log_popularity',
    'log_vote_count',
    'production_countries_id',
    'btc_id',
]
movies_df = movies_df.drop(labels=columns_to_drop, axis='columns', errors='ignore')

In [9]:
# Text preprocessing (movie description): fetch the required NLTK resources
# into NLTK_PATH.
# nltk.download returns False when a package cannot be fetched; the results are
# checked so that a failed download is reported here instead of surfacing later
# as a LookupError when the resource is first used.
paquetes_nltk = ['punkt', 'stopwords', 'wordnet', 'omw-1.4', 'averaged_perceptron_tagger']
paquetes_fallidos = []
for paquete in paquetes_nltk:
    if not nltk.download(paquete, download_dir=NLTK_PATH):
        paquetes_fallidos.append(paquete)

if paquetes_fallidos:
    print("⚠️ No se pudieron descargar los paquetes de NLTK:", paquetes_fallidos)

# Cell result: True only when every package was downloaded or already present
not paquetes_fallidos

[nltk_data] Downloading package punkt to
[nltk_data]     C:/Users/oscar/Desktop/P1-HENRY/env/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:/Users/oscar/Desktop/P1-HENRY/env/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:/Users/oscar/Desktop/P1-HENRY/env/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:/Users/oscar/Desktop/P1-HENRY/env/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:/Users/oscar/Desktop/P1-HENRY/env/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [10]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string


In [11]:
# Shared NLTK resources, built lazily on the first call to preprocess_text so
# they are not recreated for every row.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Clean and normalize a text.

    The text is lower-cased, ASCII punctuation is stripped, the result is
    tokenized, English stop words are removed and every remaining token is
    lemmatized.

    Args:
        text: Value to clean. Anything that is not a ``str`` (None, NaN,
            numbers, lists, ...) is treated as missing.

    Returns:
        The processed tokens joined by single spaces, or ``""`` when ``text``
        is not a string.
    """
    # Check the type first: calling pd.isna on a list/array returns an array
    # whose truth value is ambiguous and raises ValueError. A str is never NA,
    # so the type check alone covers the missing-value cases.
    if not isinstance(text, str):
        return ""
    stop_words, lemmatizer = _get_text_resources()
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    tokens = word_tokenize(text)
    return " ".join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )
# Replace missing overviews with a placeholder text.
# NOTE(review): preprocess_text is not applied to 'overview' here — confirm whether that is intended.
overview_faltante = movies_df['overview'].isna()
movies_df['overview'] = movies_df['overview'].mask(overview_faltante, "No description available.")


In [12]:
# Crear columna de predicción combinando diferentes atributos
movies_df['predictor'] = (
    movies_df['overview'] + " " +  # Descripción de la película
    movies_df['genres'].fillna('') + " " +  # Géneros
    movies_df['actors'].fillna('') + " " +  # Actores principales
    movies_df['director'].fillna('') + # Director
    movies_df['vote_average'].astype(str) + " " +
    movies_df['popularity'].astype(str)
)


In [13]:
# Write the processed data to a Snappy-compressed Parquet file
# (pyarrow engine, index not stored)
ruta_salida_parquet = os.path.join(RUTA_BASE, "movies_dataset_processed.parquet")
movies_df.to_parquet(
    path=ruta_salida_parquet,
    index=False,
    compression='snappy',
    engine='pyarrow',
)

print("✅ Datos procesados y guardados correctamente en:", ruta_salida_parquet)

✅ Datos procesados y guardados correctamente en: C:/Users/oscar/Desktop/P1-HENRY/data_set\movies_dataset_processed.parquet
