In [None]:
import os
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
# NOTE(review): 'ignore' suppresses every warning category for the rest of the
# session, which also hides pandas warnings raised by later cells.
warnings.filterwarnings('ignore')

## MovieLens 데이터 불러오기

In [None]:
# Mount Google Drive at /content/drive (Colab-only; `path` below points inside it).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Directory on the mounted drive that the MovieLens CSV files are read from.
path = '/content/drive/MyDrive/data/movielens/'

In [None]:
# Load the three MovieLens tables. movies is keyed by movieId and ratings by
# userId; tags keeps the default integer index.
movies_df = pd.read_csv(
    os.path.join(path, 'movies.csv'),
    index_col='movieId',
    encoding='utf-8',
)
ratings_df = pd.read_csv(
    os.path.join(path, 'ratings.csv'),
    index_col='userId',
    encoding='utf-8',
)
tags_df = pd.read_csv(
    os.path.join(path, 'tags.csv'),
    encoding='utf-8',
)

In [None]:
# Sanity check: (rows, columns) of the ratings and movies tables.
print(ratings_df.shape)
print(movies_df.shape)

## 장르 분석하기
  * 전체 장르 파악하기

In [None]:
# Preview the movies table; `genres` holds a single '|'-separated string per movie.
movies_df.head()

In [None]:
# One list of genre names per movie, obtained by splitting the '|'-separated string.
all_genres = [genre_field.split('|') for genre_field in movies_df['genres']]

In [None]:
# Peek at the first five per-movie genre lists.
all_genres[:5]

In [None]:
import itertools

# Flatten the per-movie genre lists and de-duplicate them.
# The result is sorted because the iteration order of a set of strings can differ
# between interpreter runs (string hash randomization); without sorting, the
# column order of any frame built from `genres` would not be reproducible.
genres = sorted(set(itertools.chain.from_iterable(all_genres)))
print(len(all_genres))  # number of movies
print(len(genres))      # number of distinct genre labels
print(genres)

In [None]:
# Empty (all-NaN) indicator table: one row per movieId, one column per genre label.
genres_df = pd.DataFrame(index=movies_df.index, columns=genres)
genres_df.head()

In [None]:
# Mark each movie's genres with 1 in the indicator table.
# The write uses a single .loc[row, columns] call: the chained form
# `genres_df.loc[movie_id][cols] = 1` assigns into an intermediate row object
# and is not guaranteed to modify genres_df itself.
for movie_id in genres_df.index:
  list_of_genres = movies_df.loc[movie_id, 'genres'].split('|')
  genres_df.loc[movie_id, list_of_genres] = 1


In [None]:
# Inspect the indicator table after the fill loop (unset cells are still NaN here).
genres_df.head()

In [None]:
# Turn the remaining NaN cells into 0 and store the indicators as integers.
genres_df = genres_df.fillna(0).astype(int)
# Sum only the genre columns so that re-running this cell does not add the
# previously computed num_genres column into the total.
genres_df['num_genres'] = genres_df[genres].sum(axis=1)
print(genres_df.shape)
print(genres_df)

* `get_dummies` 활용하기

In [None]:
# One-hot encode the '|'-separated genre string in a single vectorized call.
# This rebinds genres_df, replacing the table built in the cells above.
genres_df = movies_df['genres'].str.get_dummies(sep='|')
genres_df.head()

In [None]:
# Append the genre indicator columns to the movies table (both are indexed by movieId).
movies_df = pd.concat((movies_df, genres_df), axis='columns')
movies_df.head()

In [None]:
# The raw genre string is redundant once the indicator columns exist.
movies_df = movies_df.drop(columns='genres')

In [None]:
# Confirm the table now has the title plus one indicator column per genre.
movies_df.head()

## 특정 장르의 평점과 user 분석

In [None]:
# List the available column names (title + genre indicators).
movies_df.columns

In [None]:
# Titles of all Animation movies; the Series index carries their movieIds.
movieId_list = movies_df.loc[movies_df['Animation'] == 1, 'title']
movieId_list.index[:5]

In [None]:
# Keep only the ratings whose movieId belongs to an Animation movie.
animation_df = ratings_df.loc[ratings_df['movieId'].isin(movieId_list.index)]
animation_df.head()

In [None]:
# Mean Animation rating per user. userId is the index of ratings_df (set through
# index_col when the CSV was read), so groupby resolves the name as an index level.
animation_df.groupby('userId')['rating'].mean()

## 장르간 상관관계

In [None]:
# Pairwise correlation between the genre indicator columns.
genres_df.corr()

In [None]:
# Draw the genre correlation matrix as an annotated heatmap on a large canvas.
fig, ax = plt.subplots(figsize=(40, 40))
sns.heatmap(genres_df.corr(), annot=True, ax=ax)

## 영화 이름, 연도 분석

In [None]:
# Look at the title strings before parsing them.
movies_df.head()

In [None]:
# Work on a copy so the year column added below never touches movies_df.
title_df = movies_df.copy()
# Preview only; the original second `.copy()` call made a throwaway duplicate
# and rendered the entire frame.
title_df.head()

In [None]:
# Pull the first "(YYYY)" token out of each title; titles without one get NaN.
# The pattern is a raw string because '\(' and '\d' are not valid escape
# sequences in a regular string literal. expand=False returns a Series, which
# is the natural right-hand side for a single-column assignment.
title_df['year'] = title_df['title'].str.extract(r'(\(\d{4}\))', expand=False)
title_df.head()

In [None]:
# Count missing values per column (shows how many titles had no year token).
title_df.isna().sum()

In [None]:
# Show the movies for which no year could be extracted.
title_df[title_df['year'].isnull()]

In [None]:
# Movies without a year, or flagged as '(no genres listed)', are candidates for dropping.
# This only counts the '(no genres listed)' rows; nothing is removed here.
title_df[title_df['(no genres listed)'] == 1].shape

In [None]:
# Drop the movies that have no extracted year, then strip the surrounding
# parentheses so year is a bare 'YYYY' string.
# dropna is restricted to the year column so a missing value in an unrelated
# column can never remove a row, and the frame is rebound instead of mutated in
# place, which keeps the cell safe to re-run.
title_df = (
    title_df
    .dropna(subset=['year'])
    .assign(year=lambda frame: frame['year'].str.strip('()'))
)
title_df.head()

## 연도별 영화 데이터 분석

In [None]:
# Number of movies per release year. Despite the `_df` suffix this is a Series
# indexed by year, since a single column is selected before count().
year_freq_df = title_df.groupby('year')['title'].count()

In [None]:
# Movie counts in year order.
year_freq_df

In [None]:
# Years ranked by number of movies, most prolific first.
year_freq_df.sort_values(ascending=False)

In [None]:
# Summary statistics of the per-year movie counts.
year_freq_df.describe()

## 영화와 개봉연도, 그리고 평점

In [None]:
# Reminder of the movies table layout (indexed by movieId).
movies_df.head()

In [None]:
# Display the cleaned title/year table.
title_df

In [None]:
# Movies released in 2017. year holds strings, hence the comparison with '2017'.
title_df[title_df['year']=='2017']

In [None]:
# Mean rating over all ratings given to movies released in 2017.
ratings_df.loc[
    ratings_df['movieId'].isin(title_df.index[title_df['year'] == '2017']),
    'rating',
].mean()

In [None]:
# Preview the ratings table (indexed by userId).
ratings_df.head()

In [None]:
# Average rating per release year, computed in a single pass: look up each
# rating's release year through its movieId, then aggregate by that year.
# This replaces a loop that rescanned the whole ratings table once per year.
# Ratings for movies absent from title_df map to NaN and are left out of the groups.
rating_years = ratings_df['movieId'].map(title_df['year'])
# Group by a plain array so keys are matched by position, not by the repeated
# userId index labels.
avg_by_year = ratings_df['rating'].groupby(rating_years.to_numpy()).mean()

# Reindex to every year present in title_df so that years without any rating
# still appear, with NaN as their average.
results_df = (
    avg_by_year
    .reindex(title_df['year'].unique())
    .rename_axis('year')
    .reset_index(name='avg_ratings')
)
results_df.sort_values(by='year')

In [None]:
# Histogram of the numeric columns of results_df
# (presumably just avg_ratings, since year is kept as a string; verify).
results_df.hist()