In [78]:
import pandas as pd
import os
import re

In [79]:
# Define file directories
# Folder holding the raw input files; joined with the file names below via os.path.join
MOVIELENS_DIR = 'Raw_Data/'
# User table, read later into `users`
USER_DATA_FILE = 'users.dat'
# Movie table, read later into `movies`
MOVIE_DATA_FILE = 'movies.dat'
# Rating table, read later into `ratings`
RATING_DATA_FILE = 'ratings.dat'

### `MOVIE DATA`


**Input:**  
Raw movie information containing the following fields:  
- `movie_id`
- `title` (includes the release year, e.g. `Toy Story (1995)`)
- `genres` (one or more genres separated by `|`, e.g. `Animation|Children's|Comedy`)

**Output:**  
The raw data is cleaned and normalized into three separate relational tables to support efficient querying and database normalization:

1. **`movie`**  
   - Fields: `movie_id`, `name`, `year` 
   - Stores basic movie metadata.

2. **`genre`**  
   - Fields: `genre_id`, `genre`  
   - Contains the unique list of movie genres.

3. **`movie_genre`**  
   - Fields: `movie_id`, `genre_id`  
   - Maps each movie to its corresponding genres (many-to-many relationship).

In [80]:
# Load the '::'-delimited movie file; column names are supplied here
movies_path = os.path.join(MOVIELENS_DIR, MOVIE_DATA_FILE)
movies = pd.read_csv(
    movies_path,
    sep='::',
    engine='python',      # the multi-character separator needs the python engine
    encoding='latin-1',
    names=['movie_id', 'title', 'genres'],
)
# Shift movie ids down by one so the smallest raw id (1) becomes 0
movies['movie_id'] -= 1
movies.head(5)

Unnamed: 0,movie_id,title,genres
0,0,Toy Story (1995),Animation|Children's|Comedy
1,1,Jumanji (1995),Adventure|Children's|Fantasy
2,2,Grumpier Old Men (1995),Comedy|Romance
3,3,Waiting to Exhale (1995),Comedy|Drama
4,4,Father of the Bride Part II (1995),Comedy


In [81]:
# Extract name and year from title
def extract_title_year(title):
    """Split a raw title such as ``"Toy Story (1995)"`` into name and year.

    Parameters
    ----------
    title : str
        Raw title, expected to end with a four-digit year in parentheses.

    Returns
    -------
    tuple
        ``(name, year)`` with ``year`` as an ``int`` when the title ends in
        ``(YYYY)`` preceded by whitespace; otherwise ``(title, None)`` with
        ``title`` returned unchanged.
    """
    # Missing values (e.g. NaN) cannot be parsed; pass them through instead
    # of letting re.match raise TypeError.
    if not isinstance(title, str):
        return title, None
    # Tolerate stray whitespace around the title and before/after the year.
    # The lazy name group combined with the end anchor keeps inner
    # parentheses: "Seven (Se7en) (1995)" -> "Seven (Se7en)".
    match = re.match(r'^\s*(.*?)\s+\((\d{4})\)\s*$', title)
    if match:
        return match.group(1), int(match.group(2))
    return title, None

# Parse every raw title into a two-column frame (name, year)
title_parts = movies['title'].apply(
    lambda raw_title: pd.Series(extract_title_year(raw_title))
)
movies[['name', 'year']] = title_parts
# Nullable integer dtype keeps years integral even where no year was parsed
movies['year'] = movies['year'].astype('Int64')
# The raw title is redundant once name and year exist
movies = movies.drop(columns=['title'])

# Display updated movie table
print("Updated Movie Table:")
movies.head(5)

Updated Movie Table:


Unnamed: 0,movie_id,genres,name,year
0,0,Animation|Children's|Comedy,Toy Story,1995
1,1,Adventure|Children's|Fantasy,Jumanji,1995
2,2,Comedy|Romance,Grumpier Old Men,1995
3,3,Comedy|Drama,Waiting to Exhale,1995
4,4,Comedy,Father of the Bride Part II,1995


In [82]:
# Expand the pipe-delimited genre string into one record per (movie, genre)
movie_genre_pairs = [
    {'movie_id': movie_id, 'genre': genre_name}
    for movie_id, genre_string in zip(movies['movie_id'], movies['genres'])
    for genre_name in genre_string.split('|')
]
genre_set = {pair['genre'] for pair in movie_genre_pairs}

# Genre lookup table: ids follow the sorted order of the genre names
genre_list = sorted(genre_set)
genre_df = pd.DataFrame({
    'genre_id': range(len(genre_list)),
    'genre': genre_list,
})

In [83]:
# Link table: swap each genre name for its genre_id via a join on 'genre'
pairs_df = pd.DataFrame(movie_genre_pairs)
pairs_with_ids = pairs_df.merge(genre_df, on='genre')
movie_genre_df = pairs_with_ids[['movie_id', 'genre_id']]

# Movie table: only the columns that describe the movie itself
movie_columns = ['movie_id', 'name', 'year']
movie_df = movies[movie_columns]

### `RATINGS DATA`


**Input:**  
Raw data containing the following fields: 
- `user_id` 
- `movie_id`
- `rating`: a whole-star rating on a 5-star scale
- `timestamp`: seconds since the epoch, as returned by `time(2)`

**Output:**  
The raw data is cleaned into a single table; user and movie ids are re-indexed to start at 0:

**`ratings`**  
   - Fields: `user_id`, `movie_id`, `rating`, `timestamp` 
   - Stores rating information.


In [84]:

# Load the '::'-delimited ratings file; column names are supplied here
ratings_path = os.path.join(MOVIELENS_DIR, RATING_DATA_FILE)
ratings = pd.read_csv(
    ratings_path,
    sep='::',
    engine='python',
    encoding='latin-1',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
)

# Shift both id columns down by one. The existing columns are overwritten;
# no separate *_emb_id columns are created.
for id_column in ('user_id', 'movie_id'):
    ratings[id_column] = ratings[id_column] - 1

ratings.head(5)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,0,1192,5,978300760
1,0,660,3,978302109
2,0,913,3,978301968
3,0,3407,4,978300275
4,0,2354,5,978824291


### `USER DATA`


**Input:**  
Raw user demographic profiles containing the following fields:  
- `user_id`
- `gender`: F, M
- `age`: is chosen from the following ranges:

	*  1:  "Under 18"
	* 18:  "18-24"
	* 25:  "25-34"
	* 35:  "35-44"
	* 45:  "45-49"
	* 50:  "50-55"
	* 56:  "56+"
- `occupation`: is chosen from the following choices:

	*  0:  "other" or not specified
	*  1:  "academic/educator"
	*  2:  "artist"
	*  3:  "clerical/admin"
	*  4:  "college/grad student"
	*  5:  "customer service"
	*  6:  "doctor/health care"
	*  7:  "executive/managerial"
	*  8:  "farmer"
	*  9:  "homemaker"
	* 10:  "K-12 student"
	* 11:  "lawyer"
	* 12:  "programmer"
	* 13:  "retired"
	* 14:  "sales/marketing"
	* 15:  "scientist"
	* 16:  "self-employed"
	* 17:  "technician/engineer"
	* 18:  "tradesman/craftsman"
	* 19:  "unemployed"
	* 20:  "writer"
- `zipcode`

**Output:**  
The raw data is cleaned and normalized into three separate relational tables to support efficient querying and database normalization:

1. **`users`**  
   - Fields: `user_id`, `gender`, `age_id`, `occupation_id`, `zipcode` 
   - Stores basic user demographic data.

2. **`ages`**  
   - Fields: `age_id`, `age_range`  
   - Contains the unique list of age ranges.

3. **`occupation`**  
   - Fields: `occupation_id`, `title`  
   - Contains the unique list of occupations.


In [85]:
# Load the '::'-delimited user file; column names are supplied here
users_path = os.path.join(MOVIELENS_DIR, USER_DATA_FILE)
users = pd.read_csv(
    users_path,
    sep='::',
    engine='python',
    encoding='latin-1',
    names=['user_id', 'gender', 'age', 'occupation', 'zipcode'],
)
# Shift user ids down by one so the smallest raw id (1) becomes 0
users["user_id"] -= 1

In [86]:
# Preview the first five user rows (head() defaults to n=5)
users.head()

Unnamed: 0,user_id,gender,age,occupation,zipcode
0,0,F,1,10,48067
1,1,M,56,16,70072
2,2,M,25,15,55117
3,3,M,45,7,2460
4,4,M,25,20,55455


In [87]:
# Raw age code -> human-readable age range
AGES = { 1: "Under 18", 18: "18-24", 25: "25-34", 35: "35-44", 45: "45-49", 50: "50-55", 56: "56+" }

# Create age table: age_id is the position of each range within AGES
age_df = pd.DataFrame({
    'age_id': range(len(AGES)),
    'age_range': list(AGES.values())
})

# Mapping age with new id (same ordering as age_df, so ids line up)
age_mapping = {age_code: age_id for age_id, age_code in enumerate(AGES)}
mapped_age = users["age"].map(age_mapping)

# Series.map yields NaN for any value missing from the mapping. That also
# happens if this cell is re-run on already-mapped ids, so fail loudly
# instead of silently writing NaN into the user table.
unmapped = mapped_age.isna()
if unmapped.any():
    raise ValueError(
        "Unexpected values in users['age'] (was this cell already run?): {}".format(
            users.loc[unmapped, "age"].unique().tolist()
        )
    )
users["age"] = mapped_age


In [88]:
# Occupation code -> occupation title
OCCUPATIONS = {
    0: "other or not specified",
    1: "academic/educator",
    2: "artist",
    3: "clerical/admin",
    4: "college/grad student",
    5: "customer service",
    6: "doctor/health care",
    7: "executive/managerial",
    8: "farmer",
    9: "homemaker",
    10: "K-12 student",
    11: "lawyer",
    12: "programmer",
    13: "retired",
    14: "sales/marketing",
    15: "scientist",
    16: "self-employed",
    17: "technician/engineer",
    18: "tradesman/craftsman",
    19: "unemployed",
    20: "writer",
}
# Occupation lookup table: one row per (code, title) pair, in dict order
occupation_df = pd.DataFrame(
    list(OCCUPATIONS.items()),
    columns=['occupation_id', 'title'],
)
occupation_df.head(5)

Unnamed: 0,occupation_id,title
0,0,other or not specified
1,1,academic/educator
2,2,artist
3,3,clerical/admin
4,4,college/grad student


In [89]:

# Name the foreign-key columns after the keys of age_df / occupation_df
fk_column_names = {'occupation': 'occupation_id', "age": "age_id"}
users = users.rename(columns=fk_column_names)
users.head(5)

Unnamed: 0,user_id,gender,age_id,occupation_id,zipcode
0,0,F,0,10,48067
1,1,M,6,16,70072
2,2,M,2,15,55117
3,3,M,4,7,2460
4,4,M,2,20,55455


In [90]:
# Store data into csv files
OUTPUT_DIR = "Preprocessed_Data"
# to_csv does not create missing folders, so make sure the target exists
# (a fresh checkout would otherwise fail here on Restart & Run All).
os.makedirs(OUTPUT_DIR, exist_ok=True)

# File name -> table to export; every table is written without its index
output_tables = {
    "movies.csv": movie_df,
    "genres.csv": genre_df,
    "movies-genre.csv": movie_genre_df,
    "ratings.csv": ratings,
    "users.csv": users,
    "age.csv": age_df,
    "occupation.csv": occupation_df,
}
for file_name, table in output_tables.items():
    table.to_csv(os.path.join(OUTPUT_DIR, file_name), index=False)