In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import font_manager

# Korean font setup.
# 'Malgun Gothic' is only shipped with Windows, so pick the first Korean-capable
# font that matplotlib actually finds on this machine. The candidate order keeps
# the original choice first, so behaviour on Windows is unchanged.
_KOREAN_FONT_CANDIDATES = ('Malgun Gothic', 'AppleGothic', 'NanumGothic')
_installed_font_names = {entry.name for entry in font_manager.fontManager.ttflist}
_korean_font = next(
    (name for name in _KOREAN_FONT_CANDIDATES if name in _installed_font_names),
    _KOREAN_FONT_CANDIDATES[0],  # none installed: fall back to the original setting
)
plt.rc('font', family=_korean_font)
# Turn off the Unicode minus glyph for negative tick labels.
plt.rcParams['axes.unicode_minus'] = False

In [2]:
# Load the six cleaned CSV tables in one pass; the order of the names on the left
# matches the order of the paths below.
movies, recommendation, reviews, search, users, watch = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/2/movies_cleaned.csv',
        './data/2/recommendation_cleaned.csv',
        './data/2/reviews_cleaned.csv',
        './data/2/search_cleaned.csv',
        './data/2/users_cleaned.csv',
        './data/2/watch_cleaned.csv',
    )
)

## 컬럼 분류

In [None]:
# Column names of each table, grouped as numeric / categorical / boolean.

# --- movies ---
movies_numeric = [
    'release_year',
    'duration_minutes',
    'imdb_rating',
]
movies_categorical = [
    'movie_id',
    'title',
    'content_type',
    'genre_primary',
    'rating',
    'language',
    'country_of_origin',
    'added_to_platform',
]
movies_boolean = [
    'is_netflix_original',
    'content_warning',
]

# --- recommendation ---
recommendation_numeric = [
    'position_in_list',
]
recommendation_categorical = [
    'recommendation_id',
    'user_id',
    'movie_id',
    'recommendation_date',
    'recommendation_type',
    'device_type',
    'time_of_day',
    'algorithm_version',
]
recommendation_boolean = [
    'was_clicked',
]

# --- reviews ---
reviews_numeric = [
    'rating',
    'helpful_votes',
    'total_votes',
    'sentiment_score',
]
reviews_categorical = [
    'review_id',
    'user_id',
    'movie_id',
    'review_date',
    'device_type',
    'review_text',
    'sentiment',
]
reviews_boolean = [
    'is_verified_watch',
]

# --- search ---
search_numeric = [
    'results_returned',
    'search_duration_seconds',
]
search_categorical = [
    'search_id',
    'user_id',
    'search_query',
    'search_date',
    'device_type',
    'location_country',
]
search_boolean = [
    'had_typo',
    'used_filters',
]

# --- users ---
users_numeric = [
    'age',
    'monthly_spend',
    'household_size',
]
users_categorical = [
    'user_id',
    'email',
    'first_name',
    'last_name',
    'gender',
    'country',
    'state_province',
    'city',
    'subscription_plan',
    'subscription_start_date',
    'primary_device',
    'created_at',
]
users_boolean = [
    'is_active',
]

# --- watch ---
watch_numeric = [
    'watch_duration_minutes',
    'progress_percentage',
]
watch_categorical = [
    'session_id',
    'user_id',
    'movie_id',
    'watch_date',
    'device_type',
    'action',
    'quality',
    'location_country',
]
watch_boolean = [
    'is_download',
]

## 1. 데이터 병합 및 전처리

In [ ]:
# Attach each watch session to its user (inner join on user_id) ...
user_watch = pd.merge(users, watch, on='user_id', how='inner')

# ... and then to the movie that was watched (inner join on movie_id).
merged_df = pd.merge(user_watch, movies, on='movie_id', how='inner')

# Add an age-group column to both the user table and the merged table.
# With right=False every bin is left-closed, [lo, hi), so the edges must sit on
# the decade boundaries: [0, 20) -> '10대 이하', [20, 30) -> '20대', ...,
# [60, inf) -> '60대 이상'. The previous edges (19, 29, ...) pushed ages 19, 29,
# 39, 49 and 59 into the next decade's label and left age >= 100 unlabelled.
bins = [0, 20, 30, 40, 50, 60, np.inf]
labels = ['10대 이하', '20대', '30대', '40대', '50대', '60대 이상']
users['age_group'] = pd.cut(users['age'], bins=bins, labels=labels, right=False)
merged_df['age_group'] = pd.cut(merged_df['age'], bins=bins, labels=labels, right=False)

## 2. 연령대별 선호 장르 분석

In [ ]:
# Count watch records per (age group, primary genre).
# 'age_group' is categorical (it comes from pd.cut), so `observed` is passed
# explicitly: observed=False keeps every age group in the result, even one with
# no watch records, and avoids depending on the pandas-version-specific default.
genre_counts = merged_df.groupby(['age_group', 'genre_primary'], observed=False)['movie_id'].count()

# Long form (one row per age group / genre pair), kept under its original name.
age_genre = genre_counts.reset_index()
# Wide form for the heatmap: rows are age groups, columns are genres. The counts
# are already aggregated, so a plain unstack replaces the earlier pivot_table pass.
age_genre_pivot = genre_counts.unstack(fill_value=0)

fig, ax = plt.subplots(figsize=(18, 10))
sns.heatmap(age_genre_pivot, cmap='viridis', annot=True, fmt='.0f', ax=ax)
ax.set_title('연령대별 선호 장르 (시청 횟수 기준)')
ax.set_xlabel('장르')
ax.set_ylabel('연령대')
plt.show()

## 3. 연령대별 월간 지출액 및 구독 요금제 분석

In [ ]:
# Box plot of monthly spend for each age group, drawn from the users table.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=users, x='age_group', y='monthly_spend', palette='pastel', ax=ax)
ax.set_title('연령대별 월간 지출액')
ax.set_xlabel('연령대')
ax.set_ylabel('월간 지출액')
plt.show()

In [ ]:
# Number of users per (age group, subscription plan), one column per plan.
# 'age_group' is categorical (it comes from pd.cut), so `observed` is passed
# explicitly: observed=False keeps every age group on the x axis and avoids
# depending on the pandas-version-specific default.
plan_counts = users.groupby(['age_group', 'subscription_plan'], observed=False)['user_id'].count()
age_subscription = plan_counts.unstack().fillna(0)

# Stacked bars: one bar per age group, split by subscription plan.
fig, ax = plt.subplots(figsize=(14, 8))
age_subscription.plot(kind='bar', stacked=True, colormap='viridis', ax=ax)
ax.set_title('연령대별 구독 요금제 분포')
ax.set_xlabel('연령대')
ax.set_ylabel('사용자 수')
ax.tick_params(axis='x', labelrotation=45)
ax.legend(title='구독 요금제')
plt.show()

## 4. 연령대별 시청 행태 분석

In [ ]:
# Box plot of watch duration (minutes) for each age group.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    data=merged_df,
    x='age_group',
    y='watch_duration_minutes',
    palette='coolwarm',
    showfliers=False,  # do not draw outlier points
    ax=ax,
)
ax.set_title('연령대별 평균 시청 시간 (분)')
ax.set_xlabel('연령대')
ax.set_ylabel('시청 시간 (분)')
plt.show()

In [ ]:
# Box plot of progress percentage for each age group.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    data=merged_df,
    x='age_group',
    y='progress_percentage',
    palette='magma',
    showfliers=False,  # do not draw outlier points
    ax=ax,
)
ax.set_title('연령대별 시청 완료율 (%)')
ax.set_xlabel('연령대')
ax.set_ylabel('시청 완료율 (%)')
plt.show()

## 5. 연령대별 주로 사용하는 기기 분석

In [ ]:
# User counts per age group, with one bar per primary device within each group.
fig, ax = plt.subplots(figsize=(14, 7))
sns.countplot(data=users, x='age_group', hue='primary_device', palette='plasma', ax=ax)
ax.set_title('연령대별 주 사용 기기')
ax.set_xlabel('연령대')
ax.set_ylabel('사용자 수')
ax.legend(title='기기 종류')
plt.show()