In [1]:
import pandas as pd

## Movies

In [2]:
genres = pd.read_csv("ml-32m/movies.csv")

In [3]:
genres.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
links = pd.read_csv("ml-32m/links.csv")
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [5]:
merge_1 = pd.merge(genres, links, on='movieId')
merge_1.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0


In [6]:
len(merge_1)

87585

In [7]:
directors = pd.read_json("genome/metadata_updated.jsonl", lines=True)
directors.head()

Unnamed: 0,title,directedBy,starring,avgRating,imdbId,item_id
0,Toy Story (1995),John Lasseter,"Tim Allen, Tom Hanks, Don Rickles, Jim Varney,...",3.89146,114709,1
1,Jumanji (1995),Joe Johnston,"Jonathan Hyde, Bradley Pierce, Robin Williams,...",3.26605,113497,2
2,Grumpier Old Men (1995),Howard Deutch,"Jack Lemmon, Walter Matthau, Ann-Margret , Sop...",3.17146,113228,3
3,Waiting to Exhale (1995),Forest Whitaker,"Angela Bassett, Loretta Devine, Whitney Housto...",2.86824,114885,4
4,Father of the Bride Part II (1995),Charles Shyer,"Steve Martin, Martin Short, Diane Keaton, Kimb...",3.0762,113041,5


In [8]:
len(directors)

84661

In [9]:
merged_2 = pd.merge(directors, merge_1, left_on='item_id', right_on='movieId', how='left')

# Drop the duplicate key column if necessary

merged_2.drop(columns=['movieId', 'title_y', 'tmdbId','imdbId_y'], inplace=True)
merged_2.rename(columns={'title_x':'title', 'imdbId_x':'imdbId'}, inplace=True)
columns = ['item_id', 'title', 'genres', 'directedBy', 'starring', 'avgRating', 'imdbId']
merged_2 = merged_2[columns]
merged_2.reset_index(drop=True, inplace=True)
merged_2.head()

Unnamed: 0,item_id,title,genres,directedBy,starring,avgRating,imdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,John Lasseter,"Tim Allen, Tom Hanks, Don Rickles, Jim Varney,...",3.89146,114709
1,2,Jumanji (1995),Adventure|Children|Fantasy,Joe Johnston,"Jonathan Hyde, Bradley Pierce, Robin Williams,...",3.26605,113497
2,3,Grumpier Old Men (1995),Comedy|Romance,Howard Deutch,"Jack Lemmon, Walter Matthau, Ann-Margret , Sop...",3.17146,113228
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Forest Whitaker,"Angela Bassett, Loretta Devine, Whitney Housto...",2.86824,114885
4,5,Father of the Bride Part II (1995),Comedy,Charles Shyer,"Steve Martin, Martin Short, Diane Keaton, Kimb...",3.0762,113041


In [10]:
len(merged_2)

84661

In [11]:
merged_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84661 entries, 0 to 84660
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   item_id     84661 non-null  int64  
 1   title       84661 non-null  object 
 2   genres      73476 non-null  object 
 3   directedBy  84661 non-null  object 
 4   starring    84661 non-null  object 
 5   avgRating   84661 non-null  float64
 6   imdbId      84661 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 4.5+ MB


In [12]:
# Filter rows where the 'genres' column is empty or null
no_genres_df = merged_2[merged_2['genres'].isnull() | (merged_2['genres'] == '')]

# Display the rows without genres
print(no_genres_df)

       item_id                                              title genres  \
288        291                               Poison Ivy II (1996)    NaN   
541        545                                       Harem (1985)    NaN   
617        624              Condition Red (Beyond the Law) (1995)    NaN   
871        888  Land Before Time III: The Time of the Great Gi...    NaN   
1044      1064             Aladdin and the King of Thieves (1996)    NaN   
...        ...                                                ...    ...   
84653   239300               The Grand Knockout Tournament (1987)    NaN   
84654   239302                                   Basements (1987)    NaN   
84655   239304                              Grease Day USA (1978)    NaN   
84657   239308                                   Nightmare (1974)    NaN   
84659   239312                                   Pee Nak 2 (2020)    NaN   

                         directedBy  \
288                   Anne Goursaud   
541      

In [14]:
merged_2.to_json('genome/movies.jsonl', orient='records', index=False, lines=True)

In [15]:
from sqlalchemy import create_engine

In [16]:
engine = create_engine('postgresql://admin:admin@localhost:5432/movielens')

In [17]:
engine.connect()

<sqlalchemy.engine.base.Connection at 0x13d3ae690>

In [18]:
merged_2.to_sql('movies', con=engine, if_exists='replace', index=False)

661

## Ratings

In [None]:
ratings = pd.read_json('genome/ratings.jsonl', nrows=100, lines=True)
ratings.head()

Unnamed: 0,item_id,user_id,rating
0,5,997206,3
1,10,997206,4
2,13,997206,4
3,17,997206,5
4,21,997206,4


In [None]:
ratings = pd.read_json('genome/ratings.jsonl', lines=True, chunksize=100000)

In [None]:
from time import time

In [None]:
# Step 1: Read the first chunk to get the schema
first_chunk = next(pd.read_json('genome/ratings.jsonl', lines=True, chunksize=100000))

# Step 2: Create an empty table based on the schema
first_chunk.head(0).to_sql(name='ratings', con=engine, if_exists='replace', index=False)

# Step 3: Write the first chunk and the remaining chunks to the table
first_chunk.to_sql(name='ratings', con=engine, if_exists='append', index=False)

# Step 4: Iterate over the rest of the chunks and insert them into the database
for chunk in pd.read_json('genome/ratings.jsonl', lines=True, chunksize=100000):
    t_start = time()
    chunk.to_sql(name='ratings', con=engine, if_exists='append', index=False)
    t_end = time()
    print(f'Processed chunk in {t_end - t_start} seconds')

Processed chunk in 0.9928138256072998 seconds
Processed chunk in 1.1722221374511719 seconds
Processed chunk in 1.1077780723571777 seconds
Processed chunk in 1.24397611618042 seconds
Processed chunk in 1.0509638786315918 seconds
Processed chunk in 1.050959825515747 seconds
Processed chunk in 1.22330904006958 seconds
Processed chunk in 1.0011777877807617 seconds
Processed chunk in 1.176987886428833 seconds
Processed chunk in 0.9669339656829834 seconds
Processed chunk in 1.1732351779937744 seconds
Processed chunk in 1.1130168437957764 seconds
Processed chunk in 1.111896276473999 seconds
Processed chunk in 1.1184849739074707 seconds
Processed chunk in 1.046431064605713 seconds
Processed chunk in 1.0775938034057617 seconds
Processed chunk in 1.0706291198730469 seconds
Processed chunk in 1.041321039199829 seconds
Processed chunk in 1.1113159656524658 seconds
Processed chunk in 1.0690159797668457 seconds
Processed chunk in 1.135908842086792 seconds
Processed chunk in 0.9605891704559326 second

## Reviews

In [None]:
reviews = pd.read_json('genome/reviews.jsonl', nrows=100, lines=True)
reviews.head()

Unnamed: 0,item_id,txt
0,172063,"one-shot record of a belly dancer; ""Carmencita..."
1,95541,Banging Away..; Thomas Edison and William K.L....
2,7065,unbelievable; I cannot understand how anyone c...
3,3739,I'm still starry-eyed from it; I saw this last...
4,1562,Failed on every Front; Joel Schumacher who did...


In [None]:
reviews = pd.read_json('genome/reviews.jsonl', lines=True, chunksize=100000)

In [None]:
# Step 1: Read the first chunk to get the schema
first_chunk = next(pd.read_json('genome/reviews.jsonl', lines=True, chunksize=100000))

# Step 2: Create an empty table based on the schema
first_chunk.head(0).to_sql(name='reviews', con=engine, if_exists='replace', index=False)

# Step 3: Write the first chunk and the remaining chunks to the table
first_chunk.to_sql(name='reviews', con=engine, if_exists='append', index=False)

i = 1
# Step 4: Iterate over the rest of the chunks and insert them into the database
for chunk in pd.read_json('genome/reviews.jsonl', lines=True, chunksize=100000):
    t_start = time()
    chunk.to_sql(name='reviews', con=engine, if_exists='append', index=False)
    t_end = time()
    print(f'Processed chunk {i} in {t_end - t_start} seconds')
    i += 1

Processed chunk 1 in 7.306144952774048 seconds
Processed chunk 2 in 6.557629108428955 seconds
Processed chunk 3 in 6.185736417770386 seconds
Processed chunk 4 in 6.3691511154174805 seconds
Processed chunk 5 in 6.723142147064209 seconds
Processed chunk 6 in 6.844648122787476 seconds
Processed chunk 7 in 7.170928716659546 seconds
Processed chunk 8 in 9.170273780822754 seconds
Processed chunk 9 in 8.888615846633911 seconds
Processed chunk 10 in 8.703731060028076 seconds
Processed chunk 11 in 11.276880979537964 seconds
Processed chunk 12 in 8.47710108757019 seconds
Processed chunk 13 in 8.52095913887024 seconds
Processed chunk 14 in 8.912200927734375 seconds
Processed chunk 15 in 8.622588872909546 seconds
Processed chunk 16 in 8.540123701095581 seconds
Processed chunk 17 in 8.628298044204712 seconds
Processed chunk 18 in 8.014307975769043 seconds
Processed chunk 19 in 7.593925952911377 seconds
Processed chunk 20 in 7.354861259460449 seconds
Processed chunk 21 in 6.721498727798462 seconds
P

## Survey Answers

In [None]:
survey_answers = pd.read_json('genome/survey_answers.jsonl', lines=True)
survey_answers.head()

Unnamed: 0,user_id,item_id,tag_id,score
0,978707,3108,50126,3
1,978707,2858,50126,1
2,978707,1269,50126,1
3,978707,1136,50126,1
4,978707,1220,50126,1


In [None]:
survey_answers.nunique()

user_id     679
item_id    5546
tag_id     1094
score         6
dtype: int64

678/247383 user_id -> cannot represent 

## Tag count

In [None]:
tag_counts = pd.read_json('genome/tag_count.jsonl', lines=True)
tag_counts.head()

Unnamed: 0,item_id,tag_id,num
0,1,86963,4
1,1,42940,1
2,1,37116,26
3,1,52206,1
4,1,34442,21


In [None]:
tag_counts.nunique()

item_id    39685
tag_id      1094
num          258
dtype: int64