## 데이터 크롤링을 위한 Notebook

##### Movie Dataset From TMDB

In [1]:
import requests
import os
import csv
import json
import pandas as pd

In [2]:
def append_data_to_dataset(df, data):
    """Append ``data`` to ``df`` as one new row and return ``df``.

    The row is written in place under the label ``len(df)``, so the
    returned object is the same DataFrame that was passed in.
    """
    next_label = len(df)
    df.loc[next_label] = data
    return df

In [3]:
# Location of the movie-detail CSV, relative to the notebook's working directory.
file_path = '../datasets/movie_detail_data.csv'

# Header row written when the CSV has to be created from scratch.
movie_columns = [
    'adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
    'imdb_id', 'original_language', 'original_title', 'overview',
    'popularity', 'poster_path', 'production_companies',
    'production_countries', 'release_date', 'revenue', 'runtime',
    'spoken_languages', 'status', 'tagline', 'title', 'video',
    'vote_average', 'vote_count',
]

# If the file does not exist yet, create it with only the header row so the
# read below always finds a CSV to load.
if not os.path.exists(file_path):
    with open(file_path, mode='w', newline='') as csv_file:
        csv.writer(csv_file).writerow(movie_columns)

movie_df = pd.read_csv(file_path)

In [5]:
# Crawl TMDB month by month: list the movies released in each window via the
# discover endpoint, then fetch the full detail record of every movie found.
year_range = range(1874, 1880)
month_range = range(1, 13)

# SECURITY(review): a TMDB bearer token was committed here in plain text.
# Supply it through the TMDB_BEARER_TOKEN environment variable instead; the
# literal is kept only as a fallback so existing runs keep working and should
# be revoked and removed.
tmdb_token = os.environ.get(
    "TMDB_BEARER_TOKEN",
    "eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiJiMmY1N2RmMjUzZjIyZTAwMTQ5MjRjYjdhYmI1MWIxOSIsIm5iZiI6MTcyODE4NDI5Mi44MjE4MjksInN1YiI6IjY3MDFmZTM4Zjg3OGFkZmVkMDg1ODRlMyIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.6IP5mBurL06yy5lF0FOGAoCxnxmDW6oF9p6Xp7utISM",
)

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {tmdb_token}",
}

DISCOVER_URL = "https://api.themoviedb.org/3/discover/movie"
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled request hangs forever

movie_array = []
# One session reuses the HTTP connection across the many small requests.
with requests.Session() as session:
    session.headers.update(headers)

    for year in year_range:
        for month in month_range:
            # Each window runs from the 2nd of this month to the 1st of the
            # next month, so December rolls over into January of next year.
            if month == 12:
                month_add = 1
                year_add = year + 1
            else:
                month_add = month + 1
                year_add = year

            print(f'Currently Parsing in year: {year}, month:{month}')

            # Zero-pad the month so the dates are well-formed YYYY-MM-DD.
            window = {
                "primary_release_date.gte": f"{year}-{month:02d}-02",
                "primary_release_date.lte": f"{year_add}-{month_add:02d}-01",
                "sort_by": "popularity.desc",
            }

            page = 1
            while True:
                response = session.get(
                    DISCOVER_URL,
                    params={"page": page, **window},
                    timeout=REQUEST_TIMEOUT,
                )
                if not response.ok:
                    # Error payloads carry no 'results' key; report and move
                    # on to the next month instead of failing with KeyError.
                    print(f'Discover request failed (HTTP {response.status_code}) '
                          f'for {year}-{month:02d}, page {page}')
                    break

                results = response.json().get('results')
                # Leave the paging loop once there are no more results.
                if not results:
                    break

                for item in results:
                    detail = session.get(
                        f"https://api.themoviedb.org/3/movie/{item['id']}",
                        timeout=REQUEST_TIMEOUT,
                    )
                    if not detail.ok:
                        # Do not store an error payload as if it were a movie.
                        print(f"Skipping movie {item['id']}: HTTP {detail.status_code}")
                        continue
                    movie_array.append(detail.json())
                page += 1

movie_df = pd.DataFrame(movie_array)

Currently Parsing in year: 1874, month:1
Currently Parsing in year: 1874, month:2
Currently Parsing in year: 1874, month:3
Currently Parsing in year: 1874, month:4
Currently Parsing in year: 1874, month:5
Currently Parsing in year: 1874, month:6
Currently Parsing in year: 1874, month:7
Currently Parsing in year: 1874, month:8
Currently Parsing in year: 1874, month:9
Currently Parsing in year: 1874, month:10
Currently Parsing in year: 1874, month:11
Currently Parsing in year: 1874, month:12
Currently Parsing in year: 1875, month:1
Currently Parsing in year: 1875, month:2
Currently Parsing in year: 1875, month:3
Currently Parsing in year: 1875, month:4
Currently Parsing in year: 1875, month:5
Currently Parsing in year: 1875, month:6
Currently Parsing in year: 1875, month:7
Currently Parsing in year: 1875, month:8
Currently Parsing in year: 1875, month:9
Currently Parsing in year: 1875, month:10
Currently Parsing in year: 1875, month:11
Currently Parsing in year: 1875, month:12
Currently 

In [6]:
# Display the (rows, columns) dimensions of the crawled DataFrame.
movie_df.shape

(31, 26)

In [7]:
# Write without the row index: otherwise the file gains an extra unnamed first
# column that shows up as "Unnamed: 0" the next time it is read with pd.read_csv.
movie_df.to_csv(file_path, index=False)

##### Rating Dataset From ????

### Problem
- 아무리 찾아봐도 MovieLens에서 제공한 최대 6만 4천 개의 영화 평점 데이터밖에 구할 수가 없다.


### Idea
 1. TMDB에서는 유저 아이디를 입력하면 해당 유저가 평가한 영화의 rating 정보를 제공한다. 
 2. 유저 아이디는 Account API에 아이디를 입력하면 유저정보가 뜨는데, 없는 아이디는 정보가 없다.
 3. 유저 아이디는 Int32형이기 때문에, 최대값이 2^31-1(약 21억)이다.
 4. 따라서, 유저 아이디를 1부터 2^31-1까지 모두 넣어보면 유효한 유저 아이디 리스트를 만들 수 있고
 5. 그러면 해당 유저들이 평가한 rating 정보를 불러올 수 있지 않을까?

### What To Do?
 - Super 노가다
 - 약 21억 번의 API 호출이 필요한데, 초당 50회 호출 기준으로 시간당 최대 18만 회, 24시간에 최대 432만 회밖에 호출할 수 없다. 전부 시도하려면 약 500일이 걸리므로 다른 방법을 찾아야 한다.
 