In [46]:
import pandas as pd

# Read both source tables from the working directory: the TMDB export first,
# then the already pre-processed IMDb table.
tmdb, imdb = (
    pd.read_csv(csv_name)
    for csv_name in ('TMDB_movie_dataset_v11.csv', 'imdb_preprocessed.csv')
)

In [47]:
# Pick the TMDB columns that enrich the IMDb table, rename the join key to the
# IMDb name ('imdb_id' -> 'tconst'), and store the TMDB 'id' as text.
tmdb_subset = (
    tmdb[['id', 'imdb_id', 'overview', 'release_date', 'poster_path', 'backdrop_path', 'keywords', 'production_countries']]
    .rename(columns={'imdb_id': 'tconst'})
    .astype({'id': str})
    # A TMDB row without an IMDb id can never be matched, and a repeated IMDb
    # id would make the left join emit one copy of the IMDb row per duplicate.
    .dropna(subset=['tconst'])
    # NOTE(review): which of several TMDB records to keep is arbitrary here
    # (first occurrence) — confirm this is the desired tie-break.
    .drop_duplicates(subset='tconst', keep='first')
)

# Left join: every IMDb row is kept exactly once; rows with no TMDB match get
# NaN in the TMDB columns. validate= makes pandas raise if the TMDB side is
# ever not unique on 'tconst'.
merged_df = imdb.merge(tmdb_subset, on='tconst', how='left', validate='many_to_one')
merged_df.head()

Unnamed: 0,tconst,primaryTitle,originalTitle,runtimeMinutes,genres,directors,averageRating,numVotes,actors,director_names,actor_names,id,overview,release_date,poster_path,backdrop_path,keywords,production_countries
0,tt0111161,The Shawshank Redemption,The Shawshank Redemption,142.0,Drama,nm0001104,9.3,2934011.0,"nm0000151,nm0348409,nm0000209",Frank Darabont,"Morgan Freeman,Bob Gunton,Tim Robbins",278,Framed in the 1940s for the double murder of h...,1994-09-23,/lyQBXzOQSuE59IsHyhrp0qIiPAz.jpg,/kXfqcdQKsToO0OUXHcrrNCHDBzO.jpg,"prison, friendship, police brutality, corrupti...",United States of America
1,tt0468569,The Dark Knight,The Dark Knight,152.0,"Action,Crime,Drama",nm0634240,9.0,2914377.0,"nm0000288,nm0005132",Christopher Nolan,"Christian Bale,Heath Ledger",155,Batman raises the stakes in his war on crime. ...,2008-07-16,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,"joker, sadism, chaos, secret identity, crime f...","United Kingdom, United States of America"
2,tt1375666,Inception,Inception,148.0,"Action,Adventure,Sci-Fi",nm0634240,8.8,2587898.0,"nm0330687,nm0000138,nm0680983",Christopher Nolan,"Joseph Gordon-Levitt,Leonardo DiCaprio,Elliot ...",27205,"Cobb, a skilled thief who commits corporate es...",2010-07-15,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,"rescue, mission, dream, airplane, paris, franc...","United Kingdom, United States of America"
3,tt0137523,Fight Club,Fight Club,139.0,Drama,nm0000399,8.8,2365490.0,"nm0000093,nm0001533,nm0001570",David Fincher,"Brad Pitt,Meat Loaf,Edward Norton",550,A ticking-time-bomb insomniac and a slippery s...,1999-10-15,/pB8BM7pdSp6B6Ih7QZ4DrQ3PmJK.jpg,/hZkgoQYus5vegHoetLkCJzb17zJ.jpg,"dual identity, rage and hate, based on novel o...",United States of America
4,tt0109830,Forrest Gump,Forrest Gump,142.0,"Drama,Romance",nm0000709,8.8,2294581.0,"nm0000158,nm0000641",Robert Zemeckis,"Tom Hanks,Gary Sinise",13,A man with a low IQ has accomplished great thi...,1994-06-23,/arw2vcBveWOVZr6pxd9XTd1TdQa.jpg,/qdIMHd4sEfJSckfVJfKQvisL02a.jpg,"vietnam war, vietnam veteran, mentally disable...",United States of America


In [48]:
def preprocess_feature(feature):
    """Turn one cell value into text that is safe to concatenate.

    Missing values (anything ``pd.isna`` flags) become ``''``; strings have
    every comma replaced by a single space; any other value is returned
    unchanged.
    """
    if isinstance(feature, str):
        # Same result as splitting on ',' and re-joining the parts with ' '.
        return feature.replace(',', ' ')
    return '' if pd.isna(feature) else feature

# Normalise every text (object-dtype) column: missing values become '' and
# commas become spaces, so the string concatenation below cannot produce NaN.
# NOTE(review): this also rewrites object columns that are not part of
# 'content' (e.g. the title columns) — confirm that is intended.
for column in merged_df.select_dtypes(include=['object']).columns:
    merged_df[column] = merged_df[column].apply(preprocess_feature)

# Combine features into a single text field. Multiplying a string column by k
# repeats its text k times, so that column's tokens occur k times in 'content'.
merged_df['content'] = (
    merged_df['overview'] + ' '
    + (merged_df['keywords'] + ' ') * 3 + ' '
    + (merged_df['directors'] + ' ') * 5 + ' '
    + (merged_df['genres'] + ' ') * 3 + ' '
    + (merged_df['actors'] + ' ') * 2 + ' '
    + merged_df['production_countries']
)

# Display info about the merged dataset. DataFrame.info() prints its report
# itself and returns None, so wrapping it in print() would add a stray "None".
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 409412 entries, 0 to 409411
Data columns (total 19 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   tconst                409412 non-null  object 
 1   primaryTitle          409412 non-null  object 
 2   originalTitle         409412 non-null  object 
 3   runtimeMinutes        409412 non-null  float64
 4   genres                409412 non-null  object 
 5   directors             409412 non-null  object 
 6   averageRating         277705 non-null  float64
 7   numVotes              277705 non-null  float64
 8   actors                409412 non-null  object 
 9   director_names        409412 non-null  object 
 10  actor_names           409412 non-null  object 
 11  id                    409412 non-null  object 
 12  overview              409412 non-null  object 
 13  release_date          409412 non-null  object 
 14  poster_path           409412 non-null  object 
 15  

In [49]:
# Expose the primary title under the shorter column name 'title'.
merged_df.rename(columns={'primaryTitle': 'title'}, inplace=True)

# Build the row filter from two named pieces:
#   - the vote count of each row
#   - whether 'production_countries' mentions Argentina (case-insensitive)
votes = merged_df['numVotes']
is_argentine = merged_df['production_countries'].str.contains('Argentina', case=False, na=False)

# Keep rows with more than 20000 votes, or rows mentioning Argentina with
# more than 10000 votes.
condition = (votes > 20000) | (is_argentine & (votes > 10000))

# Apply the filter and summarise what is left.
filtered_df = merged_df[condition]
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7584 entries, 0 to 11064
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   tconst                7584 non-null   object 
 1   title                 7584 non-null   object 
 2   originalTitle         7584 non-null   object 
 3   runtimeMinutes        7584 non-null   float64
 4   genres                7584 non-null   object 
 5   directors             7584 non-null   object 
 6   averageRating         7584 non-null   float64
 7   numVotes              7584 non-null   float64
 8   actors                7584 non-null   object 
 9   director_names        7584 non-null   object 
 10  actor_names           7584 non-null   object 
 11  id                    7584 non-null   object 
 12  overview              7584 non-null   object 
 13  release_date          7584 non-null   object 
 14  poster_path           7584 non-null   object 
 15  backdrop_path         758

In [50]:
# Write the filtered rows to a CSV one directory above the notebook;
# index=False keeps the pandas row index out of the file.
output_csv = '../dataset.csv'
filtered_df.to_csv(output_csv, index=False)

print("Filtered dataset saved")


Filtered dataset saved
