# Installations

In [None]:
%pip install pandas


Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Using cached pandas-3.0.0-cp314-cp314-win_amd64.whl.metadata (19 kB)
Collecting numpy>=2.3.3 (from pandas)
  Using cached numpy-2.4.2-cp314-cp314-win_amd64.whl.metadata (6.6 kB)
Collecting tzdata (from pandas)
  Using cached tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-3.0.0-cp314-cp314-win_amd64.whl (9.9 MB)
Using cached numpy-2.4.2-cp314-cp314-win_amd64.whl (12.4 MB)
Using cached tzdata-2025.3-py2.py3-none-any.whl (348 kB)
Installing collected packages: tzdata, numpy, pandas

   ---------------------------------------- 0/3 [tzdata]
   ---------------------------------------- 0/3 [tzdata]
   ---------------------------------------- 0/3 [tzdata]
   ---------------------------------------- 0/3 [tzdata]
   ---------------------------------------- 0/3 [tzdata]
   ---------------------------------------- 0/3 [tzdata]
   ---------------------------------------- 0/3 [


[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import pandas as pd
import numpy as np
print(f"pd version:{pd.__version__}")
print(f"np version:{np.__version__}")

3.0.0
2.4.2


# Load and display files

In [None]:

#load the files
movies_file = pd.read_csv("data/movies.csv")
links_file = pd.read_csv("data/links.csv")
ratings_file = pd.read_csv("data/ratings.csv")
tags_file = pd.read_csv("data/tags.csv")

#print first 5 rows
movies_file.head()
links_file.head()
ratings_file.head()
tags_file.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


# Restructure Dataset


### genre csv

In [46]:
genres_col = movies_file["genres"].str.split("|").explode().unique()
genre_data = {
    "genreId":range(len(genres_col)),
    "genre": genres_col
}
genre_df = pd.DataFrame(data=genre_data)
genre_file = genre_df.to_csv("data/genre.csv", index=False)


### movie_genres csv

In [None]:
movie_genres = (
    movies_file[["movieId", "genres"]]
    .assign(genre = movies_file["genres"].str.split("|"))
    .explode("genre")
)

movie_genre_df = movie_genres.merge(
    genre_df,
    left_on="genre",
    right_on="genre",
    how="left"
)

# ✅ Keep the three columns you want (using the *actual* column names)
movie_genre_df = movie_genre_df[["movieId", "genreId", "genre"]]

# (Optional) — only if you want lowercase movieid in the final file
movie_genre_df.rename(columns={"movieId": "movieid"}, inplace=True)

movie_genres_file = movie_genre_df.to_csv("data/movie_genre.csv", index=False)


### drop cols

In [51]:
movies_file = movies_file.drop(columns=["genres"])
movies_file.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


### change data types

In [65]:
genre_files = pd.read_csv("data/genre.csv")
movie_genre_files = pd.read_csv("data/movie_genre.csv")

links_file["tmdbId"] = links_file["tmdbId"].astype("Int64")
ratings_file["timestamp"] = pd.to_datetime(ratings_file["timestamp"], unit="s")
tags_file["timestamp"] = pd.to_datetime(tags_file["timestamp"], unit="s")

 

   userId  movieId  rating           timestamp
0       1        1     4.0 2000-07-30 18:45:03
1       1        3     4.0 2000-07-30 18:20:47
2       1        6     4.0 2000-07-30 18:37:04
3       1       47     5.0 2000-07-30 19:03:35
4       1       50     5.0 2000-07-30 18:48:51


### split timestamp into date and time


In [None]:
ratings_file["date"] = ratings_file["timestamp"].dt.date
ratings_file["time"] = ratings_file["timestamp"].dt.time

tags_file["date"] = tags_file["timestamp"].dt.date
tags_file["time"] = tags_file["timestamp"].dt.time

tags_file = tags_file.drop(columns=["timestamp"])
ratings_file = ratings_file.drop(columns=["timestamp"])


KeyError: 'timestamp'

### remove nulls

In [90]:
links_file = links_file.dropna()
links_file.isna().sum()


movieId    0
imdbId     0
tmdbId     0
dtype: int64

### remove duplicates

In [None]:
links_file.drop_duplicates(inplace=True)
links_file.duplicated().sum()

#make letters lowercase in
tags_file["tag"].str.lower()

0                  funny
1        highly quotable
2           will ferrell
3           boxing story
4                    mma
              ...       
3678           for katie
3679             austere
3680              gun fu
3681    heroic bloodshed
3682    heroic bloodshed
Name: tag, Length: 3683, dtype: str

### save files


In [114]:
tags_file.to_csv("data/tags.csv", index=False)
ratings_file.to_csv("data/ratings.csv", index=False)
movies_file.to_csv("data/movies.csv", index=False)
links_file.to_csv("data/links.csv", index=False)