In [4]:
import pandas as pd
from dotenv import load_dotenv
import os
import requests
import json
import ast
import itertools

In [5]:
# Load the raw movie table from the local parquet file.
movies = pd.read_parquet("files/movies.parquet")

In [6]:
# Show which columns the raw table provides.
movies.columns

Index(['id', 'type', 'imdb_id', 'title', 'original_title', 'original_language',
       'spoken_languages', 'overview', 'tagline', 'status', 'release_date',
       'runtime', 'adult', 'budget', 'revenue', 'vote_count', 'vote_average',
       'popularity', 'genres', 'genre_ids', 'keywords', 'top_cast',
       'directors', 'writers', 'production_companies', 'production_countries',
       'providers_flatrate', 'providers_rent', 'providers_buy', 'poster_path',
       'imdb_rating', 'imdb_num_votes', 'metascore', 'user_score'],
      dtype='object')

In [7]:
# Clean the raw table in one pass:
#   1. parse release_date into datetimes,
#   2. drop rows missing genres, overview or poster_path,
#   3. keep runtimes in the half-open range (45, 300]  (presumably minutes - confirm),
#   4. renumber the index from zero.
movies = (
    movies
    .assign(release_date=lambda frame: pd.to_datetime(frame['release_date']))
    .dropna(subset=['genres', 'overview', 'poster_path'])
    .loc[lambda frame: (frame['runtime'] > 45) & (frame['runtime'] <= 300)]
    .reset_index(drop=True)
)

In [16]:
# Count missing values per column.
movies.isna().sum()

id                          0
type                        0
imdb_id                     0
title                       0
original_title              0
original_language           0
spoken_languages          196
overview                    0
tagline                 12224
status                      0
release_date                0
runtime                     0
adult                       0
budget                      0
revenue                     0
vote_count                  0
vote_average                0
popularity                  0
genres                      0
genre_ids                   0
keywords                 6050
top_cast                   96
directors                  14
writers                  1048
production_companies     1248
production_countries      399
providers_flatrate          0
providers_rent              0
providers_buy               0
poster_path                 0
imdb_rating                21
imdb_num_votes             21
metascore               16690
user_score

In [8]:
# Export the descriptive and rating columns; the TMDB vote columns are renamed
# to tmdb_rating / tmdb_num_votes before writing.
(
    movies[[
        'id', 'imdb_id', 'title', 'original_language', 'overview',
        'release_date', 'runtime', 'genres', 'keywords', 'poster_path',
        'vote_average', 'vote_count', 'imdb_rating', 'imdb_num_votes',
    ]]
    .rename(columns={'vote_average': 'tmdb_rating', 'vote_count': 'tmdb_num_votes'})
    .to_parquet('files/movies_filtered.parquet')
)

In [9]:
# Export budget/revenue only for rows where both figures are non-zero.
(
    movies
    .loc[
        movies['budget'].ne(0) & movies['revenue'].ne(0),
        ['id', 'imdb_id', 'budget', 'revenue'],
    ]
    .to_parquet('files/movies_finance.parquet')
)

In [10]:
# Preview the genres column (shown below as comma-separated genre names).
movies['genres']

0                                                Drama
1                                          Documentary
2                               Comedy, Drama, Romance
3                            Action, Adventure, Horror
4                         Adventure, Animation, Action
                             ...                      
26966                                  Romance, Comedy
26967                      Fantasy, Adventure, Romance
26968                                     Crime, Drama
26969                           Romance, Crime, Comedy
26970    Animation, Family, Comedy, Adventure, Mystery
Name: genres, Length: 26971, dtype: object

In [11]:
# Flatten nested lists and remove duplicates
def flatten_and_to_set(nested_lists):
    """Flatten an iterable of iterables into a list of unique items.

    Duplicates are removed while keeping the order in which items are first
    encountered. Going through a plain ``set`` would make the order of string
    items depend on per-process hash randomisation, so the result (and anything
    written to disk from it) could differ between kernel restarts.

    Args:
        nested_lists: Iterable whose elements are themselves iterables of
            hashable items.

    Returns:
        list: The unique items in first-seen order; ``[]`` for empty input.
    """
    # dict keys are unique and preserve insertion order.
    return list(dict.fromkeys(itertools.chain.from_iterable(nested_lists)))

# Provider column parsing
def optimized_provider_parse(data, column):
    """Turn a column of dict literals into per-row lists of unique values.

    Every cell of ``data[column]`` is evaluated with ``ast.literal_eval`` and
    must yield an object with ``.values()`` whose values are iterables
    (presumably a mapping to lists of provider names - confirm against the
    data). The values of each row are flattened and de-duplicated with
    ``flatten_and_to_set``.

    Args:
        data: DataFrame holding the column to parse.
        column: Name of the column containing the literals.

    Returns:
        pandas.Series: One list of unique values per input row, on the same
        index as ``data[column]``.
    """
    def _unique_values(raw):
        # literal_eval only accepts Python literals, so the cell is parsed
        # rather than executed.
        mapping = ast.literal_eval(raw)
        return flatten_and_to_set(list(mapping.values()))

    return data[column].apply(_unique_values)

def table_normalization(data, column_list, output_dir="files"):
    """Explode list-valued columns into long tables and write each to parquet.

    For every name in ``column_list`` the pair ``['imdb_id', <column>]`` is
    taken from ``data``, exploded so each list element gets its own row,
    re-indexed from zero and written to ``<output_dir>/<column>.parquet``.
    Frames are written one at a time instead of being collected in memory.

    Args:
        data: DataFrame containing ``imdb_id`` and the columns to explode.
        column_list: Iterable of column names to normalise.
        output_dir: Directory receiving the parquet files. Defaults to
            ``"files"``; it is created if it does not exist.

    Returns:
        None. A completion message is printed once all files are written.
    """
    os.makedirs(output_dir, exist_ok=True)

    for column in column_list:
        exploded = data[['imdb_id', column]].explode(column).reset_index(drop=True)
        exploded.to_parquet(os.path.join(output_dir, column + ".parquet"))

    print("행 정규화 완료")

def parsing_columns(data, columns):
    """Split comma-separated string columns into lists, in place.

    Each named column of ``data`` is overwritten with the result of splitting
    its strings on ``', '``. Missing values stay missing.

    Args:
        data: DataFrame to modify; it is mutated, not copied.
        columns: Iterable of column names to split.

    Returns:
        The same ``data`` object, for convenience.
    """
    separator = ', '
    for name in columns:
        as_text = data[name]
        data[name] = as_text.str.split(separator)
    return data

In [12]:
# Work on a copy so the cleaned `movies` frame keeps its original string columns.
parsing_col = ['production_countries', 'genre_ids', 'keywords']
movies_1 = parsing_columns(movies.copy(), parsing_col)

# providers_flatrate holds dict literals rather than comma-separated text,
# so it goes through its own parser before being normalised with the rest.
movies_1['providers_flatrate'] = optimized_provider_parse(movies_1, "providers_flatrate")
parsing_col = parsing_col + ['providers_flatrate']

table_normalization(movies_1, parsing_col)

행 정규화 완료


In [None]:
# Build files/genres.parquet: one row per (imdb_id, genre_id) with the genre
# name looked up from the TMDB genre list.
load_dotenv()
API_KEY = os.getenv("TMDB_API_KEY")
if not API_KEY:
    # Fail early instead of sending "api_key=None" to the API.
    raise RuntimeError("TMDB_API_KEY is not set; define it in the environment or in a .env file.")

# TMDB endpoint that returns the list of movie genres
URL = f"https://api.themoviedb.org/3/genre/movie/list?api_key={API_KEY}&language=en"

# Stays None unless the request succeeds and returns a non-empty genre list,
# so the merge below never hits an undefined name.
genres_df = None

try:
    # Send the API request; the timeout keeps the cell from hanging forever.
    response = requests.get(URL, timeout=10)
    response.raise_for_status()  # raise on HTTP error status codes

    # Parse the JSON payload
    data = response.json()

    # Extract the genre list
    genres = data.get('genres', [])

    if genres:
        genres_df = pd.DataFrame(genres).rename(columns={'id':'genre_id', 'name':'genre'})
    else:
        print("❌ 장르 데이터를 찾을 수 없거나 응답이 비어 있습니다.")

except requests.exceptions.HTTPError as err:
    # requests error messages can include the request URL, which carries the
    # API key, so the key is masked before anything is printed.
    safe_err = str(err).replace(API_KEY, "***")
    print(f"❌ HTTP 오류 발생: {safe_err}")
    print("API 키를 확인하거나 요청 URL을 확인해 보세요.")
except json.JSONDecodeError:
    # Must come before RequestException: in recent requests versions the JSON
    # decoding error is also a RequestException and would be caught there.
    print("❌ 응답을 JSON으로 디코딩하는 데 실패했습니다.")
except requests.exceptions.RequestException as err:
    safe_err = str(err).replace(API_KEY, "***")
    print(f"❌ 요청 오류 발생: {safe_err}")

if genres_df is None:
    # Stop with an explicit reason rather than a NameError on genres_df.
    raise RuntimeError("Could not load the TMDB genre list; files/genres.parquet was not written.")

genre_ids = pd.read_parquet('files/genre_ids.parquet').rename(columns={'genre_ids':"genre_id"})
genre_ids['genre_id'] = genre_ids['genre_id'].astype('int')
pd.merge(genre_ids, genres_df, on = 'genre_id', how = 'left').to_parquet('files/genres.parquet')

NameError: name 'load_dotenv' is not defined

In [14]:
# Collapse infrequent production countries into a single bucket.
production_countries = pd.read_parquet('files/production_countries.parquet').dropna()
production_countries_cnt = production_countries['production_countries'].value_counts()

# value_counts() sorts by descending frequency, so everything past position 20
# is outside the 20 most common countries.
rare_countries = production_countries_cnt.iloc[20:].index

production_countries = (
    production_countries
    .assign(
        production_countries=lambda frame: (
            frame['production_countries']
            .mask(frame['production_countries'].isin(rare_countries), 'Other')
            .str.lower()
        )
    )
    # A movie with several rare countries would otherwise get repeated 'other' rows.
    .drop_duplicates()
)
production_countries.to_parquet('files/production_countries_f.parquet')

In [35]:
# Reload the filtered movie table written earlier in this notebook.
movies_filtered = pd.read_parquet('files/movies_filtered.parquet')

In [46]:
# Replace every language outside the 10 most frequent ones with the placeholder
# code 'xx'. Rows already labelled 'xx' are left out of the ranking so the cell
# is idempotent: otherwise 'xx' becomes a ranked value after the first run and
# each re-run pushes one more real language out of the top 10.
# NOTE(review): if the raw data already contains 'xx' and it ranked in the top
# 10, this keeps one more real language than the previous version - confirm.
language_col = movies_filtered['original_language']
top_languages = language_col[language_col != 'xx'].value_counts().iloc[:10].index
movies_filtered.loc[~language_col.isin(top_languages), 'original_language'] = 'xx'

In [49]:
# Save the table with collapsed language codes under a new file name.
movies_filtered.to_parquet('files/movies_filtered_f.parquet')