In [None]:
!wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime


In [None]:
# Column names are supplied explicitly, so the first line of the file is read as data.
# The two-character '::' separator is only handled by the Python parsing engine.
ratings = pd.read_csv(
    filepath_or_buffer='ml-1m/ratings.dat',
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    header=None,
    sep='::',
    engine='python',
)

In [None]:
# Movie catalogue; read as latin-1 (same codec as 'iso-8859-1').
# Column names are supplied explicitly, so the first line of the file is read as data.
movies = pd.read_csv(
    filepath_or_buffer='ml-1m/movies.dat',
    names=['MovieID', 'Title', 'Genres'],
    header=None,
    sep='::',
    engine='python',
    encoding='latin-1',
)

In [None]:
# User demographics. Column names are supplied explicitly, so the first line is read as data.
# Zip codes are identifiers, not numbers: pin the column to str so leading zeros are never
# lost to numeric inference and the later `.str[:3]` prefix extraction always has strings.
users = pd.read_csv(
    'ml-1m/users.dat',
    sep='::',
    engine='python',
    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
    dtype={'Zip-code': str},
)

In [None]:
# Preview the first five rating rows.
ratings.head(n=5)

In [None]:
# Preview the first five movie rows.
movies.head(n=5)

In [None]:
# Preview the first five user rows.
users.head(n=5)

In [None]:
# Attach movie metadata and then user demographics to each rating
# (default inner joins on the shared ID columns).
data = (
    ratings
    .merge(movies, on='MovieID')
    .merge(users, on='UserID')
)

In [None]:
# Preview the first five rows of the merged frame.
data.head(n=5)

In [None]:
# Print the merged frame's column dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Missing values per column (isnull is an alias of isna).
data.isnull().sum()

In [None]:
# Distribution of ratings. One bin is centred on each integer from 1 to 5
# (the default 10 bins do not align with whole-star ratings), and the figure
# carries its own title and axis labels so it can be read on its own.
fig, ax = plt.subplots()
data['Rating'].plot(kind='hist', bins=np.arange(0.5, 6.5, 1.0), rwidth=0.8, ax=ax)
ax.set(title='Distribution of ratings', xlabel='Rating', ylabel='Number of ratings');

In [None]:
# Number of distinct values in each column.
data.nunique(axis=0)

In [None]:
# Number of rating rows per Age code.
age_group_counts = data.groupby(by='Age').size()
print(age_group_counts)

### The Age column is already binned, so each code will be treated as its respective age group: '1' is '<18', '18' is '18-24', '25' is '25-34', '35' is '35-44', '45' is '45-49', '50' is '50-55', and '56' is '56+'.


In [None]:
## Theory: time of day may affect the genre/rating of a movie.

# Timestamp is interpreted as seconds since the Unix epoch.
# NOTE(review): the resulting hour is presumably UTC, not the rater's local time — confirm.
rated_at = pd.to_datetime(data['Timestamp'], unit='s')
data['Hour'] = rated_at.dt.hour
data.head()

In [None]:
# Split "Title (YYYY)" into a clean Title and a separate Year column.
# An anchored regex replaces fixed-position slicing so stray trailing whitespace
# cannot shift the slice. The guard makes the cell a no-op on re-run: once the
# suffix has been removed, slicing again would chop real characters off Title
# and overwrite Year with garbage.
# Year is kept as a string, as before; cast with .astype(int) where a number is needed.
if 'Year' not in data.columns:
    year_suffix = r'\s*\((\d{4})\)\s*$'
    data['Year'] = data['Title'].str.extract(year_suffix, expand=False)
    data['Title'] = data['Title'].str.replace(year_suffix, '', regex=True)
data.head()

In [None]:
# Encode Gender as a binary integer column (M -> 1, F -> 0).
# `map` with an explicit dtype is used instead of `replace`, which relies on implicit
# object-to-int downcasting that pandas has deprecated. An unexpected gender value now
# fails loudly in `astype` instead of silently remaining a string.
# The dtype guard keeps the cell safe to re-run after the column is already numeric.
if not pd.api.types.is_numeric_dtype(data['Gender']):
    data['Gender'] = data['Gender'].map({'M': 1, 'F': 0}).astype('int64')
data.head()

In [None]:
# One-hot encode Occupation so a later model does not read an order or magnitude
# into the occupation codes. The original Occupation column is kept.
occupation_dummies = pd.get_dummies(data['Occupation'], prefix='Occupation')
# Drop indicator columns left by an earlier run of this cell, so re-running
# replaces them instead of appending duplicate Occupation_* columns.
data = pd.concat(
    [data.drop(columns=occupation_dummies.columns, errors='ignore'), occupation_dummies],
    axis=1,
)
data.head()

In [None]:
# Keep only the first three characters of the zip code and one-hot encode them.
# This retains the approximate geographic region, which may help the model, while
# using far fewer columns than the full zip code would.
# The guard makes the cell safe to re-run: 'Zip-code' is dropped below, so a second
# run would otherwise raise KeyError.
if 'Zip-code' in data.columns:
    # astype(str) + zfill(5) keeps the prefix correct even if the column was parsed as
    # integers (which would have dropped leading zeros); string zips are unaffected.
    data['ZipPrefix'] = data['Zip-code'].astype(str).str.zfill(5).str[:3]
    zip_dummies = pd.get_dummies(data['ZipPrefix'], prefix='Zip')
    data = pd.concat([data.drop(columns='Zip-code'), zip_dummies], axis=1)
data.head()

In [None]:
# Number of distinct 3-character zip prefixes.
data['ZipPrefix'].nunique(dropna=True)

In [None]:
# One-hot encode the pipe-separated Genres column (one indicator column per genre).
# This is a preliminary representation for finding which features affect model
# performance; embeddings may be needed later to capture relationships between genres.
# The guard makes the cell safe to re-run: 'Genres' is dropped below, so a second
# run would otherwise raise KeyError.
if 'Genres' in data.columns:
    genre_dummies = data['Genres'].str.get_dummies(sep='|')
    data = pd.concat([data.drop(columns='Genres'), genre_dummies], axis=1)
data.head()

In [None]:
# `str.len()` measures characters, not tokens, so the message now says so.
# `data` holds one row per rating, so titles are de-duplicated by MovieID first;
# otherwise popular movies would be counted once per rating.
title_lengths = data.drop_duplicates('MovieID')['Title'].str.len()
print('\nTitles with more than 50 characters:', int(title_lengths.gt(50).sum()))

In [None]:
# Histogram of title length in characters (`str.len()` counts characters, not tokens).
# One row per movie is used so the y-axis really is a count of titles, not of ratings.
fig, ax = plt.subplots()
ax.hist(data.drop_duplicates('MovieID')['Title'].str.len())
ax.set(xlabel='Number of characters per Title', ylabel='Count of Titles');

In [None]:
# Frequency of each rating value, most common first.
data['Rating'].value_counts(sort=True)