In [1]:
import requests
import pandas as pd
import concurrent.futures

In [2]:
movies_rating = pd.read_csv('./dataset/movie-rating.csv')
links = pd.read_csv('./dataset/links.csv')

In [3]:
print(movies_rating.columns)
print(movies_rating.shape)

Index(['movieId', 'title', 'Year', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller',
       'Horror', 'Mystery', 'Sci-Fi', 'War', 'Musical', 'Documentary', 'IMAX',
       'Western', 'Film-Noir', '(no genres listed)', 'rating'],
      dtype='object')
(9730, 24)


In [4]:
links = links.drop(['imdbId'], axis=1)
print(links.columns)
print(links.shape)

Index(['movieId', 'tmdbId'], dtype='object')
(9742, 2)


In [5]:
df = pd.merge(movies_rating, links, on='movieId', how='left')
print(df.columns)
print(df.shape)

Index(['movieId', 'title', 'Year', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller',
       'Horror', 'Mystery', 'Sci-Fi', 'War', 'Musical', 'Documentary', 'IMAX',
       'Western', 'Film-Noir', '(no genres listed)', 'rating', 'tmdbId'],
      dtype='object')
(9730, 25)


In [6]:
# Get Additional Info from links.csv using tmdbId
df['vote_average'] = 0
df['vote_count'] = 0
df['popularity'] = 0
df['revenue'] = 0
df['budget'] = 0
df['runtime'] = 0

counter = 0

# 定义API的URL
api_url = "https://api.themoviedb.org/3/movie/"

# 定义你的API密钥
api_key = "32383af56afa244dbcd45da180ada9d3"

def fetch_movie_data(i):
    global counter
    
    # Define the movie ID you want to query
    movie_id = df.loc[i, 'tmdbId']

    # Use a GET request to call the API and get the JSON response
    response = requests.get(api_url + str(movie_id) + "?api_key=" + api_key)
    data = response.json()

    # Update the DataFrame with the new data
    df.loc[i, 'vote_average'] = data["vote_average"]
    df.loc[i, 'vote_count'] = data["vote_count"]
    df.loc[i, 'popularity'] = data["popularity"]
    df.loc[i, 'revenue'] = data["revenue"]
    df.loc[i, 'budget'] = data["budget"]
    df.loc[i, 'runtime'] = data["runtime"]
    
    counter += 1
    print(f"Loop has run {counter} times")

# Create a ThreadPoolExecutor
with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(fetch_movie_data, range(len(df)))

Loop has run 1 times
Loop has run 2 times
Loop has run 3 times
Loop has run 4 times
Loop has run 5 times
Loop has run 6 times
Loop has run 7 times
Loop has run 8 times
Loop has run 9 times
Loop has run 10 times
Loop has run 11 times
Loop has run 12 times
Loop has run 13 times
Loop has run 14 times
Loop has run 15 times
Loop has run 16 times
Loop has run 17 times
Loop has run 18 times
Loop has run 19 times
Loop has run 20 times
Loop has run 21 times
Loop has run 22 times
Loop has run 23 times
Loop has run 24 times
Loop has run 25 times
Loop has run 26 times
Loop has run 27 times
Loop has run 28 times
Loop has run 29 times
Loop has run 30 times
Loop has run 31 times
Loop has run 32 times
Loop has run 33 times
Loop has run 34 times
Loop has run 35 times
Loop has run 36 times
Loop has run 37 times
Loop has run 38 times
Loop has run 39 times
Loop has run 40 times
Loop has run 41 times
Loop has run 42 times
Loop has run 43 times
Loop has run 44 times
Loop has run 45 times
Loop has run 46 tim

In [7]:
print(df.columns)
print(df.shape)

Index(['movieId', 'title', 'Year', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller',
       'Horror', 'Mystery', 'Sci-Fi', 'War', 'Musical', 'Documentary', 'IMAX',
       'Western', 'Film-Noir', '(no genres listed)', 'rating', 'tmdbId',
       'vote_average', 'vote_count', 'popularity', 'revenue', 'budget',
       'runtime'],
      dtype='object')
(9730, 31)


In [8]:
print(df.head(1))

   movieId             title  Year  Adventure  Animation  Children  Comedy  \
0        1  Toy Story (1995)  1995          1          1         1       1   

   Fantasy  Romance  Drama  ...  Film-Noir  (no genres listed)   rating  \
0        1        0      0  ...          0                   0  3.92093   

   tmdbId  vote_average  vote_count  popularity    revenue    budget  runtime  
0   862.0         7.971       17363     103.992  394400000  30000000       81  

[1 rows x 31 columns]


In [9]:
# Drop the NA data
df = df.dropna()
print(df.isnull().sum())
print(df.shape)

movieId               0
title                 0
Year                  0
Adventure             0
Animation             0
Children              0
Comedy                0
Fantasy               0
Romance               0
Drama                 0
Action                0
Crime                 0
Thriller              0
Horror                0
Mystery               0
Sci-Fi                0
War                   0
Musical               0
Documentary           0
IMAX                  0
Western               0
Film-Noir             0
(no genres listed)    0
rating                0
tmdbId                0
vote_average          0
vote_count            0
popularity            0
revenue               0
budget                0
runtime               0
dtype: int64
(9723, 31)


In [10]:
# move rating to the last for target
rating = df.pop('rating')
df.insert(len(df.columns), 'rating', rating)
print(df.columns)

Index(['movieId', 'title', 'Year', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller',
       'Horror', 'Mystery', 'Sci-Fi', 'War', 'Musical', 'Documentary', 'IMAX',
       'Western', 'Film-Noir', '(no genres listed)', 'tmdbId', 'vote_average',
       'vote_count', 'popularity', 'revenue', 'budget', 'runtime', 'rating'],
      dtype='object')


In [11]:
# Check for data correlation with rating
correlation_matrix = df.corr()
print(correlation_matrix['rating'])

movieId               0.029186
Adventure            -0.019107
Animation             0.069290
Children             -0.045718
Comedy               -0.069110
Fantasy              -0.013911
Romance               0.048950
Drama                 0.158548
Action               -0.088569
Crime                 0.015451
Thriller             -0.060658
Horror               -0.128383
Mystery               0.021269
Sci-Fi               -0.057195
War                   0.070423
Musical               0.006512
Documentary           0.125603
IMAX                  0.008164
Western               0.019133
Film-Noir             0.035661
(no genres listed)    0.001040
tmdbId               -0.010445
vote_average          0.379750
vote_count            0.093503
popularity            0.001007
revenue               0.003279
budget               -0.086770
runtime               0.069438
rating                1.000000
Name: rating, dtype: float64


  correlation_matrix = df.corr()


In [13]:
# Save additonal dataframe
df.to_csv('./dataset/movie-rating-additional.csv', index=False)