In [3]:
import os
import re
import zipfile
from urllib.request import urlretrieve

import numpy as np
import pandas as pd
import scipy.sparse
from nltk.stem import WordNetLemmatizer
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# README in ml-100k contains relevant information

# Fetch and unpack the MovieLens 100k archive only when the extracted folder is
# missing, so a fresh "Restart & Run All" works without un-commenting a
# run-once block. The read_csv calls below expect the files under 'ml-100k/'.
if not os.path.isdir('ml-100k'):
    print("Downloading movielens data...")
    urlretrieve("http://files.grouplens.org/datasets/movielens/ml-100k.zip", "movielens.zip")
    # The context manager closes the archive handle once extraction is done.
    with zipfile.ZipFile('movielens.zip', "r") as zip_ref:
        zip_ref.extractall()

# Users: pipe-separated, no header row, so column names are supplied here.
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    'ml-100k/u.user', sep='|', names=users_cols, encoding='latin-1')

# Ratings: tab-separated, no header row.
ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    'ml-100k/u.data', sep='\t', names=ratings_cols, encoding='latin-1')

# The movies file contains a binary feature for each genre.
genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]

# Movies: five descriptive columns followed by one 0/1 column per genre.
movies_cols = [
    'movie_id', 'title', 'release_date', "video_release_date", "imdb_url"
] + genre_cols
movies = pd.read_csv(
    'ml-100k/u.item', sep='|', names=movies_cols, encoding='latin-1')

In [5]:
# The raw user ids are 1-based: subtract one so they start at 0, then store
# them as strings.
users["user_id"] = (users["user_id"] - 1).astype(str)
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,0,24,M,technician,85711
1,1,53,F,other,94043
2,2,23,M,writer,32067
3,3,24,M,technician,43537
4,4,33,F,other,15213


In [6]:
# Shift both id columns from 1-based integers to 0-based string ids.
ratings["movie_id"] = (ratings["movie_id"] - 1).astype(str)
ratings["user_id"] = (ratings["user_id"] - 1).astype(str)
# Store the rating values as floats.
ratings["rating"] = ratings["rating"].astype(float)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,195,241,3.0,881250949
1,185,301,3.0,891717742
2,21,376,1.0,878887116
3,243,50,2.0,880606923
4,165,345,1.0,886397596


In [7]:
# Shift the 1-based movie ids to 0-based string ids.
movies["movie_id"] = (movies["movie_id"] - 1).astype(str)
# The year is whatever follows the last '-' in release_date; str() turns a
# missing date into the literal text 'nan'.
movies["year"] = [str(released).split('-')[-1] for released in movies['release_date']]

def get_all_genres(gs):
    """Join the names of the genres flagged in ``gs`` with '-'.

    ``gs`` is an iterable of flags paired positionally with the module-level
    ``genre_cols``; a genre is included when its flag equals 1. Returns
    'Other' when no flag is set.
    """
    flagged = []
    for name, flag in zip(genre_cols, gs):
        if flag == 1:
            flagged.append(name)
    return '-'.join(flagged) if flagged else 'Other'

# One hyphen-joined genre label per movie: zip(*columns) transposes the genre
# flag columns into one tuple of flags per row.
movies['all_genres'] = [
    get_all_genres(gs) for gs in zip(*(movies[genre] for genre in genre_cols))]

movies.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,genre_unknown,Action,Adventure,Animation,Children,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year,all_genres
0,0,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,1995,Animation-Children-Comedy
1,1,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,1,0,0,1995,Action-Adventure-Thriller
2,2,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1995,Thriller
3,3,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1995,Action-Comedy-Drama
4,4,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,1,0,0,1995,Crime-Drama-Thriller


In [8]:
# Attach the movie columns to every rating row, then the columns of the user
# who gave the rating.
movielens = pd.merge(
    pd.merge(ratings, movies, on='movie_id'),
    users,
    on='user_id',
)
movielens.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title,release_date,video_release_date,imdb_url,genre_unknown,Action,...,Sci-Fi,Thriller,War,Western,year,all_genres,age,sex,occupation,zip_code
0,195,241,3.0,881250949,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,1997,Comedy,49,M,writer,55105
1,195,256,2.0,881251577,Men in Black (1997),04-Jul-1997,,http://us.imdb.com/M/title-exact?Men+in+Black+...,0,1,...,1,0,0,0,1997,Action-Adventure-Comedy-Sci-Fi,49,M,writer,55105
2,195,110,4.0,881251793,"Truth About Cats & Dogs, The (1996)",26-Apr-1996,,http://us.imdb.com/M/title-exact?Truth%20About...,0,0,...,0,0,0,0,1996,Comedy-Romance,49,M,writer,55105
3,195,24,4.0,881251955,"Birdcage, The (1996)",08-Mar-1996,,"http://us.imdb.com/M/title-exact?Birdcage,%20T...",0,0,...,0,0,0,0,1996,Comedy,49,M,writer,55105
4,195,381,4.0,881251843,"Adventures of Priscilla, Queen of the Desert, ...",01-Jan-1994,,http://us.imdb.com/M/title-exact?Adventures%20...,0,0,...,0,0,0,0,1994,Comedy-Drama,49,M,writer,55105


In [9]:
# Sum each genre flag over the merged frame, i.e. count the rating rows whose
# movie carries that genre, most frequent first.
genre_occurences = (
    movielens[genre_cols]
    .sum()
    .sort_values(ascending=False)
)
print(genre_occurences.size)
genre_occurences

19


Drama            39895
Comedy           29832
Action           25589
Thriller         21872
Romance          19461
Adventure        13753
Sci-Fi           12730
War               9398
Crime             8055
Children          7182
Horror            5317
Mystery           5245
Musical           4954
Animation         3605
Western           1854
Film-Noir         1733
Fantasy           1352
Documentary        758
genre_unknown       10
dtype: int64

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the merged frame.
movielens.describe()

Unnamed: 0,rating,unix_timestamp,video_release_date,genre_unknown,Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,age
count,100000.0,100000.0,0.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,...,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,3.52986,883528900.0,,0.0001,0.25589,0.13753,0.03605,0.07182,0.29832,0.08055,...,0.01733,0.05317,0.04954,0.05245,0.19461,0.1273,0.21872,0.09398,0.01854,32.96985
std,1.125674,5343856.0,,0.01,0.436362,0.344408,0.186416,0.258191,0.457523,0.272144,...,0.130498,0.224373,0.216994,0.222934,0.395902,0.33331,0.41338,0.291802,0.134894,11.562623
min,1.0,874724700.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
25%,3.0,879448700.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0
50%,4.0,882826900.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0
75%,4.0,888260000.0,,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0
max,5.0,893286600.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,73.0


In [11]:
# Replace the 'sex' strings with integer labels; the fitted encoder is kept in
# label_encoder_sex.
label_encoder_sex = preprocessing.LabelEncoder()
movielens['sex'] = label_encoder_sex.fit_transform(movielens['sex'])
movielens.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title,release_date,video_release_date,imdb_url,genre_unknown,Action,...,Sci-Fi,Thriller,War,Western,year,all_genres,age,sex,occupation,zip_code
0,195,241,3.0,881250949,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,1997,Comedy,49,1,writer,55105
1,195,256,2.0,881251577,Men in Black (1997),04-Jul-1997,,http://us.imdb.com/M/title-exact?Men+in+Black+...,0,1,...,1,0,0,0,1997,Action-Adventure-Comedy-Sci-Fi,49,1,writer,55105
2,195,110,4.0,881251793,"Truth About Cats & Dogs, The (1996)",26-Apr-1996,,http://us.imdb.com/M/title-exact?Truth%20About...,0,0,...,0,0,0,0,1996,Comedy-Romance,49,1,writer,55105
3,195,24,4.0,881251955,"Birdcage, The (1996)",08-Mar-1996,,"http://us.imdb.com/M/title-exact?Birdcage,%20T...",0,0,...,0,0,0,0,1996,Comedy,49,1,writer,55105
4,195,381,4.0,881251843,"Adventures of Priscilla, Queen of the Desert, ...",01-Jan-1994,,http://us.imdb.com/M/title-exact?Adventures%20...,0,0,...,0,0,0,0,1994,Comedy-Drama,49,1,writer,55105


In [12]:
# Convert occupation to one hot encoding
def encode_and_bind(original_dataframe, feature_to_encode):
    """Return the frame with one-hot columns for one feature appended.

    Parameters
    ----------
    original_dataframe : pandas.DataFrame
        Frame containing ``feature_to_encode``; it is not modified.
    feature_to_encode : str
        Name of the column to expand into indicator columns.

    Returns
    -------
    pandas.DataFrame
        Every original column (the encoded one included) followed by one
        ``<feature>_<value>`` indicator column per distinct value.
    """
    # Pin the indicator dtype: newer pandas releases default to bool, which
    # would turn the 0/1 indicators into True/False.
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]], dtype='uint8')
    return pd.concat([original_dataframe, dummies], axis=1)

# Rebinds movielens to the frame with the occupation_* indicator columns
# appended; the original 'occupation' column is kept. Running this cell twice
# appends the indicator columns a second time.
movielens = encode_and_bind(movielens, 'occupation')
movielens.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title,release_date,video_release_date,imdb_url,genre_unknown,Action,...,occupation_marketing,occupation_none,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer
0,195,241,3.0,881250949,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,1
1,195,256,2.0,881251577,Men in Black (1997),04-Jul-1997,,http://us.imdb.com/M/title-exact?Men+in+Black+...,0,1,...,0,0,0,0,0,0,0,0,0,1
2,195,110,4.0,881251793,"Truth About Cats & Dogs, The (1996)",26-Apr-1996,,http://us.imdb.com/M/title-exact?Truth%20About...,0,0,...,0,0,0,0,0,0,0,0,0,1
3,195,24,4.0,881251955,"Birdcage, The (1996)",08-Mar-1996,,"http://us.imdb.com/M/title-exact?Birdcage,%20T...",0,0,...,0,0,0,0,0,0,0,0,0,1
4,195,381,4.0,881251843,"Adventures of Priscilla, Queen of the Desert, ...",01-Jan-1994,,http://us.imdb.com/M/title-exact?Adventures%20...,0,0,...,0,0,0,0,0,0,0,0,0,1


In [11]:
# Shared lemmatizer instance used by filtering().
lemmatizer = WordNetLemmatizer()


def filtering(data):
    """Clean a title string: keep letters only and lemmatize the words.

    Every character that is not an ASCII letter or whitespace (digits and
    punctuation included) is removed, whitespace runs are collapsed to one
    space, words of two characters or fewer are dropped, and each remaining
    word is passed through the module-level ``lemmatizer``.

    Parameters
    ----------
    data : str
        Raw title text.

    Returns
    -------
    str
        The surviving lemmatized words joined by single spaces.
    """
    # Applied in order. r'\s+' matches a whole whitespace run; a bracketed
    # r'[\s+]' would instead match one whitespace character or a literal '+'.
    substitutions = (
        (r'[^a-zA-Z\s]', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        data = re.sub(pattern, replacement, data)

    # The length filter also discards the empty strings split(' ') yields for
    # leading or trailing spaces.
    words = [lemmatizer.lemmatize(word) for word in data.split(' ') if len(word) > 2]
    return ' '.join(words)

# Overwrites the raw titles with their cleaned form, so re-running this cell
# filters text that has already been filtered.
movielens['title'] = movielens['title'].apply(filtering)

# Testing
# movielens['title'].apply(filtering)

In [13]:
# 'sex' is also label-encoded in an earlier cell. Fitting a fresh encoder on the
# already-encoded integers would leave the values unchanged but replace the
# fitted string classes, so only encode while the column is still non-numeric.
if not pd.api.types.is_numeric_dtype(movielens['sex']):
    label_encoder_sex = preprocessing.LabelEncoder()
    movielens['sex'] = label_encoder_sex.fit_transform(movielens['sex'])

In [20]:
# TODO: wrap this TF-IDF step in a function.
# Fit a TF-IDF vocabulary on the cleaned titles and transform them in one call;
# the result is a scipy sparse matrix (see the error message below).
vectorizer = TfidfVectorizer()
title_tf_idf = vectorizer.fit_transform(movielens['title'])
print(title_tf_idf)

# The sparse matrix cannot be appended to the frame with pd.concat. The attempt
# below (X presumably being the TF-IDF matrix) fails with:
# TypeError: cannot concatenate object of type '<class 'scipy.sparse.csr.csr_matrix'>';
# only Series and DataFrame objs are valid
# movielens = pd.concat([movielens, X], axis=1)

  (0, 1261)	0.945205123295251
  (0, 75)	0.3264770664172435
  (1, 1428)	0.5743983745460041
  (1, 1143)	0.42918942658404663
  (1, 296)	0.6380849588041825
  (1, 76)	0.2805539676697347
  (2, 75)	0.197388596537977
  (2, 2230)	0.5043413270174262
  (2, 93)	0.4547058053957427
  (2, 435)	0.4976045962844306
  (2, 688)	0.4750032148868511
  (2, 2153)	0.16334567628879296
  (3, 75)	0.34907307637287766
  (3, 2153)	0.2888696648864757
  (3, 289)	0.891460769781795
  (4, 2153)	0.25415237058083134
  (4, 106)	0.4062583956321108
  (4, 1582)	0.42099209748523636
  (4, 1720)	0.4475787543324421
  (4, 1748)	0.3933450781299139
  (4, 656)	0.4470683256485278
  (4, 73)	0.19547164179123666
  (5, 1002)	0.734249037140736
  (5, 621)	0.5514478975278801
  (5, 72)	0.39595904809712673
  :	:
  (99992, 1477)	0.6246068676476536
  (99993, 76)	0.2053347022737972
  (99993, 2001)	0.5345377705853209
  (99993, 1943)	0.548320633793997
  (99993, 1353)	0.6094682226462305
  (99994, 76)	0.2640891962215204
  (99994, 121)	0.558232806338453

In [14]:
# The zip_code column (a Series of dtype object) contains alphanumeric strings
# such as 'E2A4H', so it cannot be cast to int:
#   ValueError: invalid literal for int() with base 10: 'E2A4H'
#   movielens_filtered['zip_code'] = movielens_filtered['zip_code'].apply(lambda x: int(x))
# Open question: is there more information worth extracting from the zip code?

In [29]:
# Remove unwanted columns
# Keeping rating, all genres, age, sex, one hot encoding of occupation.
# Open questions:
#   - How to incorporate sparse matrix of title
#   - Should occupation be one hot encoding or labelled
#   - Is year important
#   - When train test split do random shuffle split

# Title lookup per rating row. NOTE: this rebinds `movies`, which until now
# held the frame read from u.item.
movies = movielens[['user_id', 'movie_id', 'title']]
movielens_filtered = movielens.drop(columns=[
    'unix_timestamp', 'title', 'release_date', 'video_release_date',
    'imdb_url', 'all_genres', 'occupation', 'zip_code', 'year'])

# Fill nan values with the column mean. numeric_only=True keeps the string id
# columns (user_id, movie_id) out of the mean; recent pandas versions raise
# when asked to average object columns.
movielens_filtered = movielens_filtered.fillna(movielens_filtered.mean(numeric_only=True))

# Year has str -> 'nan'
# Removing it currently. A better solution like ffill() and bfill()
movielens_filtered.head()

Unnamed: 0,user_id,movie_id,rating,genre_unknown,Action,Adventure,Animation,Children,Comedy,Crime,...,occupation_marketing,occupation_none,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer
0,195,241,3.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,195,256,2.0,0,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,195,110,4.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,195,24,4.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,195,381,4.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [30]:
# Sanity check: list the remaining column names and their dtypes before saving.
print(movielens_filtered.columns)
print(movielens_filtered.dtypes)

Index(['user_id', 'movie_id', 'rating', 'genre_unknown', 'Action', 'Adventure',
       'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western', 'age', 'sex',
       'occupation_administrator', 'occupation_artist', 'occupation_doctor',
       'occupation_educator', 'occupation_engineer',
       'occupation_entertainment', 'occupation_executive',
       'occupation_healthcare', 'occupation_homemaker', 'occupation_lawyer',
       'occupation_librarian', 'occupation_marketing', 'occupation_none',
       'occupation_other', 'occupation_programmer', 'occupation_retired',
       'occupation_salesman', 'occupation_scientist', 'occupation_student',
       'occupation_technician', 'occupation_writer'],
      dtype='object')
user_id                      object
movie_id                     object
rating                      float64
genre_unknown                 int64


In [31]:
# Saving datasets
# Neither save_npz nor to_csv creates missing folders, so make sure the output
# directory exists before writing.
os.makedirs('Dataset', exist_ok=True)
scipy.sparse.save_npz('Dataset/title_tf_idf.npz', title_tf_idf)
movielens_filtered.to_csv('Dataset/movielens_preprocessed.csv', index=False)
movies.to_csv('Dataset/movies.csv', index=False)