In [15]:
# Importing packages for data cleanup
import pandas as pd
import time
import datetime
from math import ceil
from os import path, makedirs
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

# Display every column when rendering a DataFrame (None removes the column limit).
pd.set_option('display.max_columns', None)

In [31]:
# Read the movie catalogue from a '::'-delimited file, supplying the column
# names explicitly; the multi-character separator is parsed by the Python engine.
mnames = ['movie_id', 'title', 'genre']
movies_df = pd.read_table(
    'ml-1m/movies.dat',
    sep="::",
    names=mnames,
    encoding='ISO-8859-1',
    engine='python',
)

In [17]:
# Read the ratings file ('::'-delimited, no header row) and label its four
# columns explicitly.
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings_df = pd.read_table(
    "ml-1m/ratings.dat",
    sep='::',
    header=None,
    names=rnames,
    engine='python',
)

In [18]:
# Read the users file ('::'-delimited, no header row) and label its five
# columns explicitly.
uname = ['user_id', 'gender', 'age', 'occupation', 'zip']
users_df = pd.read_table(
    "ml-1m/users.dat",
    sep='::',
    header=None,
    names=uname,
    engine='python',
)

In [32]:
# Count the null entries in each ratings column, then report them both as raw
# counts and as a percentage of the total number of rows.
n_rows = ratings_df.shape[0]
missing_vals = ratings_df.isnull().sum()
print(missing_vals, '\n')

# Same arithmetic as count / rows * 100, rounded to two decimals.
perc = missing_vals.div(n_rows).mul(100).round(2)
print(f'There are {n_rows} rows in the dataset.')
print(f'Proportion of missing data for each column in %: \n{perc}')

user_id      0
movie_id     0
rating       0
timestamp    0
dtype: int64 

There are 1000209 rows in the dataset.
Proportion of missing data for each column in %: 
user_id      0.0
movie_id     0.0
rating       0.0
timestamp    0.0
dtype: float64


In [33]:
# Build the sorted list of distinct genre labels: every `genre` value is a
# '|'-separated string, so split each one and de-duplicate through a set.
genres = sorted({label for entry in movies_df['genre'] for label in entry.split('|')})
print(genres)

['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']


In [34]:
# Give movies_df one indicator column per genre, set to 0 for every movie.
for genre in genres:
    movies_df[genre] = 0

In [35]:
# Preview the first five rows to confirm the genre indicator columns exist (all 0 before encoding).
movies_df.head()

Unnamed: 0,movie_id,title,genre,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Animation|Children's|Comedy,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [36]:
# One-hot encode each movie's '|'-separated `genre` string into the per-genre
# indicator columns of movies_df.
#
# The indicator columns are every column other than the three metadata columns
# (the same rule the previous row-by-row loop applied); the list is now built
# once rather than on every row.
genres = [col for col in movies_df.columns if col not in ('movie_id', 'title', 'genre')]

# str.get_dummies splits on the literal '|' and returns one 0/1 column per
# distinct label. This replaces the positional `row[1][2]` lookup, which relied
# on deprecated integer-position indexing of a label-indexed Series.
# reindex puts the flags in the same order as the indicator columns and fills 0
# for any indicator column that no movie carries.
genre_flags = (
    movies_df['genre']
    .str.get_dummies(sep='|')
    .reindex(columns=genres, fill_value=0)
)

# Overwrite the indicator columns in one assignment; recomputing every value
# from `genre` makes the cell give the same result when re-run.
if genres:
    movies_df[genres] = genre_flags

['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Weste

In [37]:
# Preview the first five rows to spot-check the encoded genre indicator columns.
movies_df.head()

Unnamed: 0,movie_id,title,genre,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Animation|Children's|Comedy,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [38]:
# Remove the raw '|'-separated `genre` string and write the frame to films.csv
# without the index column.
# errors='ignore' keeps the cell re-runnable: on a second run 'genre' is already
# gone, and the previous in-place drop raised KeyError in that case.
movies_df = movies_df.drop(columns=['genre'], errors='ignore')
movies_df.to_csv('films.csv', index=False)