In [7]:
import pandas as pd
import os
import os.path as osp
from IPython.display import display 
import numpy as np

In [8]:
def change_to_root_location() -> None:
    """Change the working directory to the project root.

    The project root is the first directory, starting at the current working
    directory and walking up through its parents, whose listing contains a
    ``pyproject.toml`` entry. The resolved root is printed once found.

    Raises:
        FileNotFoundError: If no directory between the current working
            directory and the filesystem root contains ``pyproject.toml``.
            The working directory is left unchanged in that case.
    """
    start_path = os.getcwd()
    candidate = start_path

    # Search upwards without touching the working directory, so a failed
    # search does not leave the process stranded at the filesystem root.
    while "pyproject.toml" not in os.listdir(candidate):
        parent = os.path.dirname(candidate)
        if parent == candidate:
            # dirname() of the filesystem root is the root itself: nothing
            # left to inspect, so stop instead of looping/recursing forever.
            raise FileNotFoundError(
                f"No pyproject.toml found in {start_path!r} or any of its parent directories"
            )
        candidate = parent

    if candidate != start_path:
        os.chdir(candidate)
    print("Already in root location: ", candidate)

# Switch the working directory to the project root (the current directory or
# its nearest ancestor that contains pyproject.toml); presumably needed so the
# relative data paths used in later cells resolve.
change_to_root_location()

Already in root location:  /home/sebastiangarcia/Documents/u/8/bd-lab/semana-10/ingesta_datos


In [3]:
# Paths of the three input files inside the data folder
DATA_FOLDER = "data/ml-100k"
movies_path, user_path, ratings_path = (
    osp.join(DATA_FOLDER, file_name) for file_name in ("u.item", "u.user", "u.data")
)

# Names of the 0/1 genre indicator columns, in file order
genres_columns = [
    'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary',
    'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
    'Thriller', 'War', 'Western'
]

# Movies (u.item): pipe-separated, no header row; descriptive columns first,
# followed by one indicator column per genre, stored compactly as int8.
movies_df = pd.read_csv(
    movies_path,
    sep='|',
    encoding='latin-1',
    header=None,
    names=['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', *genres_columns],
).astype({genre_name: np.int8 for genre_name in genres_columns})

# Users (u.user): pipe-separated, no header row
users_df = pd.read_csv(
    user_path,
    sep='|',
    encoding='latin-1',
    header=None,
    names=['user_id', 'age', 'gender', 'occupation', 'zip_code'],
)

# Ratings (u.data): tab-separated, no header row
ratings_df = pd.read_csv(
    ratings_path,
    sep='\t',
    encoding='latin-1',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
)

# Preview the first rows of each frame under its own heading
for heading, frame in (
    ('Movies DataFrame:', movies_df),
    ('\nUsers DataFrame:', users_df),
    ('\nRatings DataFrame:', ratings_df),
):
    print(heading)
    display(frame.head())

Movies DataFrame:


Unnamed: 0,movie_id,title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0



Users DataFrame:


Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213



Ratings DataFrame:


Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
# Print a per-column summary (dtype and non-null count) of the movies frame
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movie_id            1682 non-null   int64  
 1   title               1682 non-null   object 
 2   release_date        1681 non-null   object 
 3   video_release_date  0 non-null      float64
 4   IMDb_URL            1679 non-null   object 
 5   unknown             1682 non-null   int8   
 6   Action              1682 non-null   int8   
 7   Adventure           1682 non-null   int8   
 8   Animation           1682 non-null   int8   
 9   Children            1682 non-null   int8   
 10  Comedy              1682 non-null   int8   
 11  Crime               1682 non-null   int8   
 12  Documentary         1682 non-null   int8   
 13  Drama               1682 non-null   int8   
 14  Fantasy             1682 non-null   int8   
 15  Film-Noir           1682 non-null   int8   
 16  Horror

In [5]:
# Collapse the per-genre 0/1 indicator columns into one comma-separated
# `genre` column, then drop the indicators.
# The guard keeps this cell re-runnable: once the indicator columns have been
# dropped, running it again is a no-op instead of raising KeyError.
if set(genres_columns).issubset(movies_df.columns):
    genre_flags = movies_df[genres_columns]
    movies_df = (
        movies_df
        # For each row, join the names of the columns whose flag equals 1
        .assign(genre=genre_flags.apply(lambda row: ','.join(row[row == 1].index), axis=1))
        .drop(columns=genres_columns)
    )
elif 'genre' not in movies_df.columns:
    # Neither the indicators nor the collapsed column exist: the frame is not
    # in a state this cell knows how to handle, so fail loudly.
    raise KeyError("movies_df has neither the genre indicator columns nor a 'genre' column")

In [6]:
# Show the movies frame (a bare last expression is rendered as the cell output)
movies_df

Unnamed: 0,movie_id,title,release_date,video_release_date,IMDb_URL,genre
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,"Animation,Children,Comedy"
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,"Action,Adventure,Thriller"
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,Thriller
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,"Action,Comedy,Drama"
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),"Crime,Drama,Thriller"
...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,Drama
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,"Romance,Thriller"
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),"Drama,Romance"
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,Comedy


In [None]:
import typing as ty

def read_dataset(
    data_folder: str, show_preview: bool = True
) -> ty.Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Load the movies, users and ratings files found in ``data_folder``.

    Reads ``u.item`` and ``u.user`` as pipe-separated files and ``u.data`` as a
    tab-separated file. All three are read as latin-1 text without a header
    row, and the column names are assigned here.

    Args:
        data_folder: Directory containing ``u.item``, ``u.user`` and ``u.data``.
        show_preview: When True (the default, matching the previous
            behaviour), print a heading and display the first rows of each
            frame. Pass False to load silently, e.g. outside a notebook.

    Returns:
        A ``(movies_df, users_df, ratings_df)`` tuple. In ``movies_df`` the
        genre indicator columns are cast to ``int8``.
    """
    # Single source of truth for the genre indicator columns (file order)
    genres_columns = [
        'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary',
        'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
        'Thriller', 'War', 'Western'
    ]
    movie_columns = ['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL'] + genres_columns

    movies_df = pd.read_csv(osp.join(data_folder, "u.item"), sep='|', encoding='latin-1',
                            header=None, names=movie_columns)
    # Store the 0/1 genre flags compactly
    movies_df[genres_columns] = movies_df[genres_columns].astype(np.int8)

    users_df = pd.read_csv(osp.join(data_folder, "u.user"), sep='|', encoding='latin-1',
                           header=None,
                           names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])

    ratings_df = pd.read_csv(osp.join(data_folder, "u.data"), sep='\t', encoding='latin-1',
                             header=None,
                             names=['user_id', 'movie_id', 'rating', 'timestamp'])

    if show_preview:
        # Display the first few rows of each DataFrame
        print('Movies DataFrame:')
        display(movies_df.head())
        print('\nUsers DataFrame:')
        display(users_df.head())
        print('\nRatings DataFrame:')
        display(ratings_df.head())
    return movies_df, users_df, ratings_df

# Reload all three frames through the helper. NOTE: this rebinds `movies_df`
# to a freshly read frame that has the per-genre indicator columns again; the
# collapsed `genre` column built in an earlier cell is not part of it.
movies_df, users_df, ratings_df = read_dataset(DATA_FOLDER)