## 드라마 데이터 전처리

In [1928]:
import pandas as pd

# Load the merged drama dataset from the local parquet file.
# NOTE(review): `df` is not referenced by any later cell visible in this notebook,
# and the same file is read again into `dramas` in the next cell — confirm whether
# this load is still needed.
df = pd.read_parquet("dramas_merged_final.parquet")



In [1929]:
# Load the drama-only file. Per the original author's note, this file already has:
# dramas filtered to vote_count >= 30, non-drama titles excluded, and
# director / writer / top_cast information merged in.
dramas = pd.read_parquet('dramas_merged_final.parquet')

In [1930]:
# Inspect the frame's structure: column names, non-null counts and dtypes.
dramas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3581 entries, 0 to 3580
Data columns (total 45 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   imdb_rating_count                 3413 non-null   float64
 1   imdb_rating                       3413 non-null   float64
 2   keyword                           3212 non-null   object 
 3   imdb_id                           3581 non-null   object 
 4   production_companies              3334 non-null   object 
 5   created_by                        3094 non-null   object 
 6   poster_path                       3579 non-null   object 
 7   type_detail                       3581 non-null   object 
 8   seasons                           3581 non-null   object 
 9   providers_flatrate                3476 non-null   object 
 10  overview                          3518 non-null   object 
 11  genre_ids                         3581 non-null   object 
 12  genres

In [1931]:
# Look at the distinct raw `seasons` strings to understand their format
# (the values shown look like "S<n>: <season name> (<k> eps); ..." entries).
dramas['seasons'].unique()


array(['S0: Specials (30 eps); S1: Season 1 (16 eps); S2: Season 2 (22 eps); S3: Season 3 (21 eps); S4: Season 4 (16 eps); S5: Season 5 (18 eps); S6: Season 6 (22 eps); S7: Season 7 (13 eps)',
       'S0: Specials (1 eps); S1: Season 1 (10 eps); S2: Season 2 (12 eps); S3: Season 3 (15 eps); S4: Season 4 (13 eps); S5: Season 5 (13 eps); S6: Season 6 (13 eps); S7: Season 7 (13 eps); S8: Season 8 (13 eps)',
       'S0: Specials (0 eps); S1: Season 1 (22 eps); S2: Season 2 (20 eps); S3: Season 3 (22 eps); S4: Season 4 (13 eps); S5: Season 5 (22 eps); S6: Season 6 (20 eps); S7: Season 7 (10 eps); S8: Season 8 (8 eps); S9: Season 9 (10 eps)',
       ..., 'S0: Specials (2 eps); S1: Miniseries (8 eps)',
       'S0: Specials (2 eps); S1: Limited Series (12 eps)',
       'S1: Season 1 (16 eps); S2: Book 2 (The Dark Side) (1 eps)'],
      dtype=object)

In [1932]:
# Preview the distinct `production_companies` strings rather than printing
# every one of them: each value is a free-form, comma-separated company list,
# so an uncapped loop can emit one output line per distinct combination and
# flood the notebook. Missing entries show up as None.
unique_companies = dramas['production_companies'].unique()
for company in unique_companies[:20]:
    print(company)
# Report the total so the reader knows how much was left out of the preview.
print(f"... showing up to 20 of {len(unique_companies):,} unique values")

Pointe Studios, Picturemaker Productions, Grammnet Productions, Paramount Television, CBS Studios
Tilted Productions, Lionsgate Television
None
Coquette Productions, Matthew Carnahan Circus Products, Touchstone Television, FX Productions, ABC Signature, Sprockets Music
ABC Studios, AfterPortsmouth Productions, Touchstone Television, Berlanti Productions
20th Century Fox Television
Junction Entertainment, Fixed Mark Productions, CBS Studios
Imagine Television Studios, 20th Century Fox Television
Shine TV
BBC Cymru Wales, CBC, Starz, BBC Worldwide Productions
The Barry Schindel Company, Post 109 Productions, Paramount Television, CBS Studios, Scott Free Productions
Blackfriars Bridge Films
UCP
Company Pictures, Stormdog Films, E4
Mediaset España, Globomedia
Lionsgate, RadicalMedia, Weiner Bros.
BBC, Monastic Productions, BBC Cymru Wales
Aggressive Mediocrity, Showtime Networks, Totally Commercial Films, And Then..., Twilight Time Films
The Lloyd Segan Company, Pillar Squared, E1 Entertai

In [1933]:
# Fraction of missing values per column in the raw frame, highest first.
(
    dramas
    .isnull()
    .mean()
    .sort_values(ascending=False)
)

created_by                          0.135996
keyword                             0.103044
production_companies                0.068975
imdb_rating_count                   0.046914
imdb_rating                         0.046914
providers_flatrate                  0.029321
overview                            0.017593
origin_country                      0.000838
poster_path                         0.000559
character                           0.000279
writers_name                        0.000279
popularity                          0.000279
executive_producer_name             0.000279
top_cast_gender                     0.000279
executive_producer_ids              0.000279
executive_producer_gender           0.000279
executive_producer_profile_path     0.000279
writer_roles                        0.000279
top_cast_ids                        0.000279
writer_ids                          0.000279
writer_gender                       0.000279
writer_profile_path                 0.000279
series_id 

In [1934]:
# Start the cleaning pipeline from an independent (deep) copy so the loaded
# `dramas` frame stays available unchanged for comparison.
dramas_cleaned = dramas.copy(deep=True)

### (1) 결측치 처리

In [1935]:
# High-importance fields: a row missing any one of these is removed entirely.
critical_cols = [
    'imdb_rating_count',
    'imdb_rating',
    'overview',
    'top_cast_ids',
    'writer_ids',
    'series_id',
    'top_cast_order',
    'top_cast',
]
dramas_cleaned = dramas_cleaned.dropna(axis=0, how='any', subset=critical_cols)

In [1936]:
# Remove low-importance columns.
# errors='ignore' drops only the listed columns that are present and silently
# skips any that are absent.
drop_cols = ['created_by']
dramas_cleaned = dramas_cleaned.drop(labels=drop_cols, axis='columns', errors='ignore')


In [1937]:
# Low-importance text/categorical columns: keep the rows and replace missing
# entries with the literal string "none".
non_cols = [
    'keyword',
    'production_companies',
    'origin_country',
    'poster_path',
    'providers_flatrate',
    'character',
    'writers_name',
    'executive_producer_name',
    'top_cast_gender',
    'executive_producer_ids',
    'executive_producer_gender',
    'executive_producer_profile_path',
    'writer_roles',
    'writer_gender',
    'writer_profile_path',
    'original_name',
    'top_cast_profile_path',
]
dramas_cleaned[non_cols] = dramas_cleaned[non_cols].fillna(value="none")

In [1938]:
# Final check: fraction of missing values per column after cleaning, highest first.
(
    dramas_cleaned
    .isnull()
    .mean()
    .sort_values(ascending=False)
)

imdb_rating_count                   0.0
imdb_rating                         0.0
in_production                       0.0
last_episode_to_air_vote_average    0.0
series_id                           0.0
original_name                       0.0
popularity                          0.0
executive_producer_name             0.0
executive_producer_ids              0.0
executive_producer_gender           0.0
executive_producer_profile_path     0.0
writers_name                        0.0
writer_roles                        0.0
writer_ids                          0.0
writer_gender                       0.0
writer_profile_path                 0.0
top_cast_order                      0.0
top_cast                            0.0
character                           0.0
top_cast_ids                        0.0
top_cast_gender                     0.0
id                                  0.0
episode_run_time                    0.0
number_of_seasons                   0.0
genre_ids                           0.0


### (2) 중복 확인

In [1939]:
# Report how many rows are exact repeats of an earlier row (all columns equal);
# the first occurrence of each row is not counted.
print("\n[전체 행 기준 중복]")
print(f"Drama 중복: {dramas_cleaned.duplicated(keep='first').sum():,}건")


[전체 행 기준 중복]
Drama 중복: 0건


In [1940]:
# List the columns that remain after cleaning.
dramas_cleaned.columns

Index(['imdb_rating_count', 'imdb_rating', 'keyword', 'imdb_id',
       'production_companies', 'poster_path', 'type_detail', 'seasons',
       'providers_flatrate', 'overview', 'genre_ids', 'genres',
       'origin_country', 'last_air_date', 'first_air_date',
       'number_of_episodes', 'title', 'status', 'vote_average', 'vote_count',
       'original_language', 'number_of_seasons', 'episode_run_time', 'id',
       'in_production', 'last_episode_to_air_vote_average', 'series_id',
       'original_name', 'popularity', 'executive_producer_name',
       'executive_producer_ids', 'executive_producer_gender',
       'executive_producer_profile_path', 'writers_name', 'writer_roles',
       'writer_ids', 'writer_gender', 'writer_profile_path', 'top_cast_order',
       'top_cast', 'character', 'top_cast_ids', 'top_cast_gender',
       'top_cast_profile_path'],
      dtype='object')

In [1941]:
# Persist the cleaned frame. index=False leaves the DataFrame index out of the
# file (after the earlier dropna the row labels are no longer guaranteed to be contiguous).
dramas_cleaned.to_parquet("dramas_preprocessed.parquet", index=False)
