In [None]:
from pathlib import Path

import pandas as pd

LOADING THE DATA

In [2]:
# Load the ratings, movies and users tables
def load_data(data_dir='../movies_dataset'):
    """Read the three '::'-delimited .dat files and return them as DataFrames.

    Parameters
    ----------
    data_dir : str or path-like, default '../movies_dataset'
        Directory that contains ratings.dat, movies.dat and users.dat.
        The default reproduces the previously hard-coded location.

    Returns
    -------
    tuple of pandas.DataFrame
        (ratings, movies, users), in that order, with columns
        ratings: user_id, movie_id, rating, timestamp
        movies:  movie_id, title, genres
        users:   user_id, gender, age, occupation, zip-code
    """
    data_dir = Path(data_dir)

    def read_dat(filename, columns, **overrides):
        # The files have no header row, so column names are supplied here.
        # A multi-character separator is not supported by pandas' C parser,
        # which is why engine='python' is requested explicitly.
        return pd.read_csv(data_dir / filename, sep="::", header=None,
                           names=columns, engine='python', **overrides)

    ratings = read_dat('ratings.dat',
                       ["user_id", "movie_id", "rating", "timestamp"])

    # movies.dat is decoded as latin-1; the other two files use pandas'
    # default encoding.
    movies = read_dat('movies.dat',
                      ["movie_id", "title", "genres"],
                      encoding='latin-1')

    # Pin zip-code to str: left to type inference, a file whose ZIP codes are
    # all purely numeric would be parsed as integers and values such as
    # "02460" would lose their leading zero.
    users = read_dat('users.dat',
                     ["user_id", "gender", "age", "occupation", "zip-code"],
                     dtype={"zip-code": str})
    return ratings, movies, users

In [3]:
# Load the three tables (ratings, movies, users) returned by load_data().
ratings, movies, users = load_data()

ANALYZING THE DATA

In [4]:
# Overview of the ratings table: dimensions, a sample of rows, column dtypes
# and the pandas info() report.
print("=== RATINGS DATA ===")
print(f"Shape: {ratings.shape}")
print("\nFirst 5 rows:")
print(ratings.head())
print("\nData types:")
print(ratings.dtypes)
print("\nBasic info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() appended a stray "None" line after the report.
ratings.info()

=== RATINGS DATA ===
Shape: (1000209, 4)

First 5 rows:
   user_id  movie_id  rating  timestamp
0        1      1193       5  978300760
1        1       661       3  978302109
2        1       914       3  978301968
3        1      3408       4  978300275
4        1      2355       5  978824291

Data types:
user_id      int64
movie_id     int64
rating       int64
timestamp    int64
dtype: object

Basic info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   user_id    1000209 non-null  int64
 1   movie_id   1000209 non-null  int64
 2   rating     1000209 non-null  int64
 3   timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB
None


In [5]:
# Overview of the movies table: dimensions, a sample of rows and column dtypes.
# A single print() with a newline separator emits one section per line.
print(
    "\n=== MOVIES DATA ===",
    f"Shape: {movies.shape}",
    "\nFirst 5 rows:",
    movies.head(),
    "\nData types:",
    movies.dtypes,
    sep="\n",
)


=== MOVIES DATA ===
Shape: (3883, 3)

First 5 rows:
   movie_id                               title                        genres
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy

Data types:
movie_id     int64
title       object
genres      object
dtype: object


In [6]:
# Overview of the users table: dimensions, a sample of rows and column dtypes.
# A single print() with a newline separator emits one section per line.
print(
    "\n=== USERS DATA ===",
    f"Shape: {users.shape}",
    "\nFirst 5 rows:",
    users.head(),
    "\nData types:",
    users.dtypes,
    sep="\n",
)


=== USERS DATA ===
Shape: (6040, 5)

First 5 rows:
   user_id gender  age  occupation zip-code
0        1      F    1          10    48067
1        2      M   56          16    70072
2        3      M   25          15    55117
3        4      M   45           7    02460
4        5      M   25          20    55455

Data types:
user_id        int64
gender        object
age            int64
occupation     int64
zip-code      object
dtype: object


In [7]:
# Total number of missing cells per table (summed over every column).
print(
    "\n=== MISSING VALUES ===",
    f"Ratings missing values: {ratings.isna().sum().sum()}",
    f"Movies missing values: {movies.isna().sum().sum()}",
    f"Users missing values: {users.isna().sum().sum()}",
    sep="\n",
)


=== MISSING VALUES ===
Ratings missing values: 0
Movies missing values: 0
Users missing values: 0


In [8]:
# Headline figures for the ratings table. Both "unique" counts are computed on
# the ratings table itself, i.e. they count users/movies that appear in at
# least one rating row.
print(
    "\n=== BASIC STATISTICS ===",
    f"Number of unique users: {ratings['user_id'].nunique()}",
    f"Number of unique movies: {ratings['movie_id'].nunique()}",
    f"Rating range: {ratings['rating'].min()} to {ratings['rating'].max()}",
    f"Total ratings: {ratings.shape[0]}",
    sep="\n",
)


=== BASIC STATISTICS ===
Number of unique users: 6040
Number of unique movies: 3706
Rating range: 1 to 5
Total ratings: 1000209


In [9]:
# Referential-integrity check: count ids that occur in the ratings table but
# have no matching row in the users / movies tables.
print("\n=== DATA CONSISTENCY CHECKS ===")
print(
    f"Users in ratings but not in users table: "
    f"{len(set(ratings['user_id']).difference(users['user_id']))}"
)
print(
    f"Movies in ratings but not in movies table: "
    f"{len(set(ratings['movie_id']).difference(movies['movie_id']))}"
)


=== DATA CONSISTENCY CHECKS ===
Users in ratings but not in users table: 0
Movies in ratings but not in movies table: 0
