In [7]:
import pandas as pd

# Relative paths of the movies, ratings and users data files read by this notebook.
movies_path, ratings_path, users_path = (
    'data/movies.dat',
    'data/ratings.dat',
    'data/users.dat',
)

In [8]:
# Attempt to load the datasets with ISO-8859-1 encoding to handle any non-UTF-8 characters
# Column names assigned to each table when it is read (passed to the loader as `names`).
movies_columns = [
    'MovieID',
    'Title',
    'Genres',      # '|'-separated genre labels, as shown in the preview output
]
ratings_columns = [
    'UserID',
    'MovieID',
    'Rating',
    'Timestamp',
]
users_columns = [
    'UserID',
    'Gender',
    'Age',
    'Occupation',
    'Zip-code',
]

def load_data(file_path, columns, encoding='ISO-8859-1'):
    """Read a '::'-delimited text file into a pandas DataFrame.

    Parameters
    ----------
    file_path
        Location of the file to read; forwarded to ``pd.read_csv``.
    columns
        Column names to assign; forwarded to ``pd.read_csv`` as ``names``.
    encoding
        Text encoding used to decode the file. Defaults to ``'ISO-8859-1'``.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    # The Python parsing engine is requested explicitly because the
    # separator is longer than a single character.
    read_options = {
        'sep': '::',
        'engine': 'python',
        'names': columns,
        'encoding': encoding,
    }
    return pd.read_csv(file_path, **read_options)

# Read each table with its own column names; the loader's default encoding is used.
movies_df = load_data(file_path=movies_path, columns=movies_columns)
ratings_df = load_data(file_path=ratings_path, columns=ratings_columns)
users_df = load_data(file_path=users_path, columns=users_columns)

# Preview the first rows of all three frames as a single tuple-valued cell output.
tuple(frame.head() for frame in (movies_df, ratings_df, users_df))


(   MovieID                               Title                        Genres
 0        1                    Toy Story (1995)   Animation|Children's|Comedy
 1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
 2        3             Grumpier Old Men (1995)                Comedy|Romance
 3        4            Waiting to Exhale (1995)                  Comedy|Drama
 4        5  Father of the Bride Part II (1995)                        Comedy,
    UserID  MovieID  Rating  Timestamp
 0       1     1193       5  978300760
 1       1      661       3  978302109
 2       1      914       3  978301968
 3       1     3408       4  978300275
 4       1     2355       5  978824291,
    UserID Gender  Age  Occupation Zip-code
 0       1      F    1          10    48067
 1       2      M   56          16    70072
 2       3      M   25          15    55117
 3       4      M   45           7    02460
 4       5      M   25          20    55455)

In [9]:
# Build one binary indicator column per genre. `str.get_dummies` splits every
# '|'-separated Genres string into whole tokens, so a genre is flagged only
# when it appears as a complete entry, never merely as a substring of another.
genre_dummies = movies_df['Genres'].str.get_dummies(sep='|')

# Sort the genre names so the column order (and therefore the layout of any
# file exported from `full_dataset`) is the same on every kernel restart;
# iterating a set of strings gives no such guarantee.
genre_columns = sorted(genre_dummies.columns)
unique_genres = set(genre_columns)  # kept in case a later cell relies on this name

# Expanding the genres in the movies dataframe into separate binary columns
movies_genres_expanded = movies_df.join(genre_dummies[genre_columns])

# Merging the expanded movies dataframe with the ratings dataframe
ratings_movies_merged = pd.merge(ratings_df, movies_genres_expanded, on='MovieID')

# Finally, merging the above with the users dataframe to get all user info,
# then dropping the columns that are not carried forward.
full_dataset = pd.merge(ratings_movies_merged, users_df, on='UserID')
full_dataset = full_dataset.drop(columns=['Genres', 'Timestamp', 'Zip-code'])

# Selecting columns to view the structure clearly, including all genres
base_columns = ['UserID', 'MovieID', 'Rating', 'Gender', 'Age', 'Occupation']
selected_columns = base_columns + genre_columns

# Keep this as the last expression so the notebook actually displays the shape.
full_dataset[selected_columns].shape


In [10]:
# Write the merged dataset to CSV without the DataFrame's row index.
output_file_path = 'data/full_dataset.csv'
full_dataset.to_csv(path_or_buf=output_file_path, index=False)

