# Ratings - Data Cleaning

In [33]:
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [34]:
# Directory containing the input CSV files, relative to this notebook.
path = "../../data/small"
# Raw ratings table as read from disk; the cleaning steps below start from it.
init_ratings = pd.read_csv(path + "/ratings.csv")

## Ratings
All ratings are contained in the file `ratings.csv`. Each line of this file after the header row represents one rating of one movie by one user, and has the following format:

    userId,movieId,rating,timestamp

The lines within this file are ordered first by userId, then, within user, by movieId.

Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).

Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.


In [35]:
def init_pipeline(df):
    """Start a cleaning pipeline from an independent copy of ``df``.

    Working on a copy means later steps that assign columns on the frame they
    receive do not alter the frame passed in here.
    """
    working_copy = df.copy(deep=True)
    return working_copy


In [36]:
def adjust_dtypes(df):
    """Convert the ``timestamp`` column from epoch seconds to datetimes.

    The column is replaced on ``df`` itself (the frame is modified in place),
    and that same frame is returned so the function can be used with ``.pipe``.
    """
    epoch_seconds = df["timestamp"]
    df["timestamp"] = pd.to_datetime(epoch_seconds, unit="s")
    return df


In [37]:
def normalize_ratings(df, max_rating=5):
    """Rescale each user's ratings to the range ``[0, max_rating]``.

    Every user's ratings are min-max scaled independently: the user's lowest
    rating maps to 0 and the highest to ``max_rating``. A user whose ratings
    are all identical gets 0 for every rating.

    The scaling is computed with a per-user ``groupby`` instead of pivoting
    into a dense (movie, timestamp) x user matrix. The dense pivot needs
    ``n_ratings * n_users`` cells, and turning it back into rows depends on
    ``stack()`` discarding the NaN filler cells, which the newer pandas
    ``stack`` implementation keeps.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``userId``, ``movieId``, ``timestamp`` and
        ``rating``. It is not modified.
    max_rating : float, default 5
        Upper bound of the rescaled ratings.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns ``movieId``, ``timestamp``, ``userId``,
        ``rating`` (in that order), sorted by ``userId`` then ``movieId``,
        with a fresh ``RangeIndex``. Rows whose rating is missing are dropped.
        Fully duplicated (movieId, timestamp, userId) rows are kept as-is.
    """
    normed_ratings = (
        df.loc[:, ["movieId", "timestamp", "userId", "rating"]]
        .dropna(subset=["rating"])
        .astype({"rating": "float64"})
    )

    per_user = normed_ratings.groupby("userId")["rating"]
    user_min = per_user.transform("min")
    user_range = per_user.transform("max") - user_min
    # A user with a single distinct rating has range 0; dividing by 1 instead
    # maps those ratings to 0 rather than NaN.
    scale = 1.0 / user_range.where(user_range != 0, 1.0)

    # Written as x*scale - min*scale (rather than (x - min) / range) so the
    # floating-point results follow the multiply-then-shift arithmetic of a
    # min-max scaler with a precomputed scale.
    normed_ratings["rating"] = (
        normed_ratings["rating"] * scale - user_min * scale
    ) * max_rating

    # `timestamp` is only a tie-breaker: it keeps the order deterministic when
    # a user has several ratings for the same movie.
    return (
        normed_ratings
        .sort_values(["userId", "movieId", "timestamp"])
        .reset_index(drop=True)
    )

In [38]:
def save_csv(df, path="../../data/clean/ratings_norm.csv"):
    """Write ``df`` to a CSV file without its index and return ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write.
    path : str or pathlib.Path, default "../../data/clean/ratings_norm.csv"
        Destination file. Missing parent directories are created.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the function can sit in a ``.pipe`` chain.
    """
    target = Path(path)
    # `to_csv` does not create directories, so without this a run against a
    # checkout that lacks the output folder would fail at the very last step.
    target.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(target, index=False)
    return df


In [39]:
# Run the cleaning steps in order on the raw ratings. `save_csv` writes the
# result to disk and hands the frame back, so `ratings` is the final table.
# Note: a `missing_values` step is currently disabled and not part of the chain.
ratings = (
    init_ratings
    .pipe(init_pipeline)
    .pipe(adjust_dtypes)
    .pipe(normalize_ratings)
    .pipe(save_csv)
)
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   movieId    100836 non-null  int64         
 1   timestamp  100836 non-null  datetime64[ns]
 2   userId     100836 non-null  int64         
 3   rating     100836 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 3.1 MB


In [40]:
# Preview the first five cleaned rows (five is the default for `head`).
ratings.head()

Unnamed: 0,movieId,timestamp,userId,rating
0,1,2000-07-30 18:45:03,1,3.75
1,3,2000-07-30 18:20:47,1,3.75
2,6,2000-07-30 18:37:04,1,3.75
3,47,2000-07-30 19:03:35,1,5.0
4,50,2000-07-30 18:48:51,1,5.0
