In [28]:
import pandas as pd

In [29]:
# Load the movieId -> imdbId/tmdbId lookup table; the path is relative to the working directory.
links = pd.read_csv(filepath_or_buffer='movie/links.csv')

In [30]:
# Preview the first five rows to check that the file parsed as expected.
links.head(n=5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [31]:
# Load the movie catalogue (movieId, title, pipe-separated genres).
movies = pd.read_csv(filepath_or_buffer='movie/movies.csv')

In [32]:
# Preview the first five movies.
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [33]:
# Load the per-user ratings (userId, movieId, rating, timestamp).
ratings = pd.read_csv(filepath_or_buffer='movie/ratings.csv')

In [34]:
# Preview the first five ratings.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [35]:
# Load the free-text tags users attached to movies (userId, movieId, tag, timestamp).
tags = pd.read_csv(filepath_or_buffer='movie/tags.csv')

In [36]:
# Preview the first five tags; note that one (userId, movieId) pair can appear on several rows.
tags.head(n=5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


Merge the datasets on movieId (tags are additionally matched on userId)

In [37]:
# Attach the external IDs to every movie; the left join keeps movies that have no row in links.
movies_links = movies.merge(links, how='left', on='movieId')

In [38]:
# Expand to one row per rating of each movie; movies without any rating are kept
# by the left join and get NaN in the rating-side columns.
movies_links_ratings = movies_links.merge(ratings, how='left', on='movieId')

In [39]:
# tags can hold several rows for one (userId, movieId) pair (the preview above shows
# user 2 / movie 60756 three times). Joining the raw table would repeat the matching
# rating row once per tag, and those repeats become exact duplicates as soon as the
# tag columns are dropped. Keep a single tag row per pair so that every rating row
# stays unique; the 'tag' and 'timestamp_y' columns are still produced by the merge.
tags_per_pair = tags.drop_duplicates(subset=['userId', 'movieId'])
combined_df = pd.merge(
    movies_links_ratings,
    tags_per_pair,
    on=['userId', 'movieId'],
    how='left',
    validate='many_to_one',  # fail loudly if the right-hand keys are ever non-unique again
)

In [40]:
# The tag text and the tag-side timestamp are not used by the remaining visible cells,
# so remove both columns.
combined_df = combined_df.drop(['tag', 'timestamp_y'], axis=1)

In [41]:
# Preview the merged frame.
combined_df.head(n=5)

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,userId,rating,timestamp_x
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,17.0,4.5,1305696000.0


Handle Missing Values

In [42]:
# Per-column count of null cells (isna is the same check as isnull).
missing = combined_df.isna().sum()

In [43]:
# Print the label on its own line: with the default separator the first Series row
# is appended to the label and no longer lines up with the rows below it.
print("Missing values in each column:", missing, sep="\n")

Missing values in each column: movieId         0
title           0
genres          0
imdbId          0
tmdbId         13
userId         18
rating         18
timestamp_x    18
dtype: int64


In [44]:
# Column means that serve as fill values for the null cells (mean() skips NaN by default).
rating_mean, timestamp_x_mean = (
    combined_df[column].mean() for column in ('rating', 'timestamp_x')
)

In [45]:
# Replace null ratings / timestamps with the column means computed in the previous cell.
# NOTE(review): the missing-value report shows the same null count for userId as for
# rating and timestamp_x, and a later cell drops every row whose userId is null. If
# those are the same rows, none of the values filled in here reach the saved file - confirm.
combined_df = combined_df.fillna({'rating': rating_mean, 'timestamp_x': timestamp_x_mean})

In [47]:
# Recount nulls after the mean fill.
missing = combined_df.isnull().sum()
# Print the label on its own line: with the default separator the first Series row
# is appended to the label and no longer lines up with the rows below it.
print("Missing values in each column:", missing, sep="\n")

Missing values in each column: movieId         0
title           0
genres          0
imdbId          0
tmdbId         13
userId         18
rating          0
timestamp_x     0
dtype: int64


In [50]:
# Discard every row that lacks a userId or a tmdbId (one null among the two is enough).
combined_df = combined_df.dropna(axis=0, how='any', subset=['userId', 'tmdbId'])

In [51]:
# Recount nulls after dropping the incomplete rows; every column should now report 0.
missing = combined_df.isnull().sum()
# Print the label on its own line: with the default separator the first Series row
# is appended to the label and no longer lines up with the rows below it.
print("Missing values in each column:", missing, sep="\n")

Missing values in each column: movieId        0
title          0
genres         0
imdbId         0
tmdbId         0
userId         0
rating         0
timestamp_x    0
dtype: int64


Encode Categorical Features

In [52]:
# Turn the pipe-separated genre string into a list of genre names, keeping the column in place.
combined_df = combined_df.assign(genres=combined_df['genres'].str.split(pat='|'))

In [53]:
# One indicator column per genre: the genre lists are re-joined with '|' because
# str.get_dummies works on delimited strings, then the indicators are appended
# to the right of the existing columns.
genres_encoded = (
    combined_df['genres']
    .str.join(sep='|')
    .str.get_dummies(sep='|')
)
combined_df = pd.concat([combined_df, genres_encoded], axis='columns')

In [54]:
from sklearn.preprocessing import StandardScaler

In [55]:
# Standardise rating and timestamp_x. The fitted scaler stays available as `scaler`.
# Note that the raw rating / timestamp values in combined_df are overwritten.
scaler = StandardScaler()
scaler.fit(combined_df[['rating', 'timestamp_x']])
combined_df[['rating', 'timestamp_x']] = scaler.transform(combined_df[['rating', 'timestamp_x']])

In [56]:
# Persist the preprocessed frame without the index column, then confirm on stdout.
combined_df.to_csv(path_or_buf='combined_movie_lens.csv', index=False)
print("Combined and preprocessed dataset saved to 'combined_movie_lens.csv'")

Combined and preprocessed dataset saved to 'combined_movie_lens.csv'
