### Add isAdult, runtimeMinutes and regions properties to Neo4j

In [17]:
import pandas as pd

In [18]:
# Load the raw IMDb title basics table (tab-separated).
# Missing values in this file show up as the literal string '\N' (see endYear
# in the preview); they are left as plain strings by this read.
basics_path = "dataset/unprocessed/title.basics.tsv"
df = pd.read_csv(basics_path, sep="\t")

  df = pd.read_csv("dataset/unprocessed/title.basics.tsv", sep = '\t')


In [19]:
# Preview the first rows of the raw title table; '\N' in the output appears to
# be the file's own placeholder for a missing value (e.g. endYear).
df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,Short


In [9]:
# Keep only the rows whose titleType is 'movie', and only the columns that
# will be written to the graph.
is_movie = df['titleType'] == 'movie'
movies_df = (
    df.loc[is_movie, ['tconst', 'isAdult', 'runtimeMinutes']]
    # Expose the IMDb key under the name used for the later merge
    .rename(columns={'tconst': 'movie_id'})
)

In [15]:
# Inspect the last rows: the index still carries the row labels from df, and
# runtimeMinutes can hold the literal '\N' string.
movies_df.tail()

Unnamed: 0,movie_id,isAdult,runtimeMinutes
11612597,tt9916622,0,57
11612624,tt9916680,0,100
11612636,tt9916706,0,\N
11612646,tt9916730,0,116
11612656,tt9916754,0,49


In [11]:
# (rows, columns) remaining after filtering to titleType == 'movie'
movies_df.shape

(712961, 3)

In [20]:
# Load the raw IMDb alternate-titles table (tab-separated).
# keep_default_na=False stops pandas from converting literal strings such as
# "NA" (the region code for Namibia) into NaN, which would make the later
# notnull() filter drop those rows. The file's own '\N' placeholder stays a
# plain string and is removed when the region lists are built.
df_akas = pd.read_csv(
    "dataset/unprocessed/title.akas.tsv",
    sep='\t',
    keep_default_na=False,
)

In [27]:
# Preview the alternate-titles table: a title can have several rows, one per
# `ordering`, each with its own region.
df_akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Carmencita,\N,\N,original,\N,1
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita,US,\N,imdbDisplay,\N,0
3,tt0000001,4,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
4,tt0000001,5,Καρμενσίτα,GR,\N,imdbDisplay,\N,0


In [32]:
# Keep only the title key and its region code
df_akas_new = df_akas[['titleId', 'region']]

# Drop rows whose region pandas parsed as NaN. The file's '\N' placeholder is
# an ordinary string, so notnull() does not catch it; it is discarded while
# grouping instead.
df_akas_new = df_akas_new[df_akas_new['region'].notnull()]

# Collect the sorted, de-duplicated region codes of each title, leaving out
# the '\N' placeholder so it never ends up in a region list
region_df = (
    df_akas_new.groupby('titleId')['region']
    .apply(lambda x: sorted(set(x) - {'\\N'}))
    .reset_index()
)
region_df.columns = ['movie_id', 'regions']

In [37]:
# Rebuild the per-title region lists, discarding the '\N' placeholder string.
# Each title ends up with a sorted list of its distinct region codes.
region_groups = df_akas_new.groupby('titleId')['region']
region_df = region_groups.apply(
    lambda codes: sorted(set(codes).difference({'\\N'}))
).reset_index()
region_df.columns = ['movie_id', 'regions']

In [44]:
# Check the grouped result: one row per movie_id with its sorted list of region codes
region_df.head()

Unnamed: 0,movie_id,regions
0,tt0000001,"[DE, GR, HU, JP, RU, UA, US]"
1,tt0000002,"[DE, FR, HU, JP, RO, RU, US]"
2,tt0000003,"[DE, FR, GB, HU, JP, RO, RU, UA, US]"
3,tt0000004,"[DE, FR, HU, JP, RO, RU]"
4,tt0000005,"[CA, DE, GB, HU, RU, UA, US]"


In [40]:
# One row per distinct titleId present in df_akas_new
region_df.shape

(11589688, 2)

In [41]:
# Left join on movie_id so every movie row is kept; a movie with no entry in
# region_df gets NaN in the 'regions' column.
combined_df = movies_df.merge(region_df, how='left', on='movie_id')

In [42]:
# Preview the merged frame whose rows are later sent to Neo4j
combined_df.head()

Unnamed: 0,movie_id,isAdult,runtimeMinutes,regions
0,tt0000009,0,45,"[AU, DE, HU, US]"
1,tt0000147,0,100,"[RU, US]"
2,tt0000502,0,100,[ES]
3,tt0000574,0,70,"[AU, DE, GB, HU, RS, SG, US]"
4,tt0000591,0,90,"[FR, US]"


In [45]:
# Sanity check: with a left merge against one-row-per-title region_df, the row
# count should equal movies_df's.
combined_df.shape

(712961, 4)

### Import to Neo4j

In [47]:
import os

from py2neo import Graph
from tqdm import tqdm

# Connection settings are read from the environment so real credentials do not
# have to be stored in the notebook; the fallbacks are the local development
# defaults.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "password")

graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

Import the properties into Neo4j batch by batch.

In [51]:
_IMDB_NULL = "\\N"  # placeholder string the IMDb TSV files use for a missing value
_INF = float("inf")


def _int_or_none(value):
    r"""Convert a raw cell to ``int``; return ``None`` when it is missing or not numeric.

    Handles ints, numeric strings such as ``"57"``, floats, and the missing
    markers ``None``, NaN, ``pd.NA`` and the ``'\N'`` placeholder string.
    """
    try:
        number = float(value)
    except (TypeError, ValueError):
        # None, pd.NA, lists and non-numeric strings (including '\N') land here
        return None
    # The chained comparison is False for NaN and for +/- infinity
    return int(number) if -_INF < number < _INF else None


def _region_list(value):
    r"""Return ``value`` as a list of region-code strings; ``[]`` when it is not a collection.

    A left merge leaves a float NaN for movies without region rows; that and
    any stray ``'\N'`` placeholder are dropped here.
    """
    if isinstance(value, (list, tuple)):
        return [code for code in value if isinstance(code, str) and code != _IMDB_NULL]
    return []


def update_in_chunks(df, chunk_size=8000):
    r"""Set isAdult, runtimeMinutes and regions on existing Movie nodes, batch by batch.

    Every batch of ``chunk_size`` rows is sent as a single ``UNWIND`` query
    through the module-level ``graph`` connection. Rows are matched on
    ``Movie.id``; ids with no matching node are skipped by the ``MATCH``.

    Values are normalised before they are sent, so the graph never receives
    the ``'\N'`` placeholder, NaN, or a mix of int and str for the same property:

    * ``isAdult`` / ``runtimeMinutes`` become ``int``, or ``None`` when
      missing (assigning null in Cypher ``SET`` removes the property);
    * ``regions`` becomes a list of strings, or ``[]`` when missing.

    Args:
        df: DataFrame with the columns ``movie_id``, ``isAdult``,
            ``runtimeMinutes`` and ``regions``.
        chunk_size: Number of rows per query; must be positive.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Cypher query to update existing Movie nodes (built once, reused per batch)
    query = """
        UNWIND $rows AS row
        MATCH (m:Movie {id: row.movie_id})
        SET m.isAdult = row.isAdult, m.runtimeMinutes = row.runtimeMinutes, m.regions = row.regions
        """

    for start in tqdm(range(0, len(df), chunk_size)):
        chunk = df.iloc[start:start + chunk_size]

        # Create list of dictionaries to pass into UNWIND
        data = [
            {
                "movie_id": row.movie_id,
                "isAdult": _int_or_none(row.isAdult),
                "runtimeMinutes": _int_or_none(row.runtimeMinutes),
                "regions": _region_list(row.regions),
            }
            for row in chunk.itertuples(index=False)
        ]

        graph.run(query, parameters={"rows": data})

# Run the update over every row of the merged frame, using the default chunk size
update_in_chunks(combined_df)

100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [00:19<00:00,  4.65it/s]
