In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

In [3]:
# Building the dataset from the raw MovieLens files is very slow, so the merged
# result is cached on disk (an intermediate checkpoint) and reloaded when present.
# NOTE: a checkpoint written by an earlier version of this cell may hold the same
# MovieYear on every row; delete the CSV once to rebuild it.

try:
    data = pd.read_csv('./data/processed_data/movie_lens_data.csv')
    # The checkpoint is saved together with its index, which is read back as
    # the 'Unnamed: 0' column.
    data = data.drop(columns=['Unnamed: 0'], errors='ignore')

except FileNotFoundError:
    # Only a missing checkpoint triggers the rebuild; any other error
    # (including a keyboard interrupt) is allowed to surface.
    ### Datasets as described @ https://files.grouplens.org/datasets/movielens/ml-25m-README.html
    print("Creating Dataset")

    # ratings.csv
    #   All ratings are contained in the file ratings.csv.
    #   Each line after the header row represents one rating of one movie by
    #   one user, and has the following format:
    #
    #       userId,movieId,rating,timestamp
    #
    #   Lines are ordered first by userId, then, within user, by movieId.
    #   Ratings are made on a 5-star scale, with half-star increments
    #   (0.5 stars - 5.0 stars).
    #   Timestamps represent seconds since midnight Coordinated Universal
    #   Time (UTC) of January 1, 1970.
    # ratings = pd.read_csv('./data/raw_data/ratings.csv')
    data = pd.read_csv('./data/raw_data/ratings.csv')

    ## Adding Tag Data (currently disabled)
    # tags.csv
    #   All tags are contained in the file tags.csv.
    #   Each line after the header row represents one tag applied to one
    #   movie by one user, and has the following format:
    #
    #       userId,movieId,tag,timestamp
    #
    #   Lines are ordered first by userId, then, within user, by movieId.
    #   Tags are user-generated metadata about movies. Each tag is typically
    #   a single word or short phrase. The meaning, value, and purpose of a
    #   particular tag is determined by each user.
    #   Timestamps represent seconds since midnight Coordinated Universal
    #   Time (UTC) of January 1, 1970.
    # tag = pd.read_csv('./data/raw_data/tags.csv')

    # #join tag and ratings
    # data = pd.merge(ratings, tag, on = ["userId", "movieId"], suffixes = ["_ratings","_tags"])

    ## Adding Movie Data
    # movies.csv
    #   Movie information is contained in the file movies.csv.
    #   Each line after the header row represents one movie, and has the
    #   following format:
    #
    #       movieId,title,genres
    #
    #   Movie titles are entered manually or imported from
    #   https://www.themoviedb.org/, and include the year of release in
    #   parentheses. Errors and inconsistencies may exist in these titles.
    #
    #   Genres are a pipe-separated list, and are selected from the following:
    #       Action, Adventure, Animation, Children's, Comedy, Crime,
    #       Documentary, Drama, Fantasy, Film-Noir, Horror, Musical, Mystery,
    #       Romance, Sci-Fi, Thriller, War, Western, (no genres listed)
    movies = pd.read_csv('./data/raw_data/movies.csv')

    data = data.merge(movies, on='movieId')

    # Clean the movie title.
    # Titles look like "Pulp Fiction (1994)". The year is pulled out per row
    # (the previous code read it from row 0 only and broadcast it to every
    # row), and the suffix is removed only where it is actually present, so
    # titles without a year are no longer truncated.
    year_suffix = r"\s*\((\d{4})\)\s*$"
    data["MovieYear"] = data["title"].str.extract(year_suffix, expand=False)
    data["title"] = data["title"].str.replace(year_suffix, "", regex=True).str.strip()

    # Saving dataset as CSV (intermediate checkpoint). The index is written
    # too, which is why the loading branch drops 'Unnamed: 0'.
    data.to_csv("./data/processed_data/movie_lens_data.csv")

Creating Dataset


In [4]:
# Preview the first five rows of the MovieLens ratings/movies table.
data.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,MovieYear
0,1,296,5.0,1147880044,Pulp Fiction,Comedy|Crime|Drama|Thriller,1994
1,3,296,5.0,1439474476,Pulp Fiction,Comedy|Crime|Drama|Thriller,1994
2,4,296,4.0,1573938898,Pulp Fiction,Comedy|Crime|Drama|Thriller,1994
3,5,296,4.0,830786155,Pulp Fiction,Comedy|Crime|Drama|Thriller,1994
4,7,296,4.0,835444730,Pulp Fiction,Comedy|Crime|Drama|Thriller,1994


In [6]:
# Load the cached IMDb movie-feature table, or rebuild it from the raw IMDb
# TSV dumps when the cache file does not exist yet.
try:
    movie_features = pd.read_excel('data/processed_data/imdb_data.xlsx')
    # The cache is saved together with its index, which is read back as
    # the 'Unnamed: 0' column.
    movie_features = movie_features.drop(columns=['Unnamed: 0'], errors='ignore')

except FileNotFoundError:
    # Only a missing cache triggers the rebuild; any other error
    # (including a keyboard interrupt) is allowed to surface.
    # NOTE(review): IMDb marks missing values with the literal '\N'; the reads
    # below keep those as plain strings — confirm that is what downstream
    # code expects.

    # name.basics – Contains the following information for names:
    #     nconst (string) - alphanumeric unique identifier of the name/person
    #     primaryName (string) – name by which the person is most often credited
    #     birthYear – in YYYY format
    #     deathYear – in YYYY format if applicable
    #     primaryProfession (array of strings) – the top-3 professions of the person
    #     knownForTitles (array of tconsts) – titles the person is known for
    # NOTE(review): name_basics is not used anywhere else in this cell; it is
    # still loaded in case a later cell relies on it — confirm and remove if not.
    name_basics = pd.read_csv('./data/raw_data/names.tsv', sep='\t')

    # title.basics.tsv.gz - Contains the following information for titles:
    #     tconst (string) - alphanumeric unique identifier of the title
    #     titleType (string) – the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc)
    #     primaryTitle (string) – the title used by the filmmakers on promotional materials at the point of release
    #     originalTitle (string) - original title, in the original language
    #     isAdult (boolean) - 0: non-adult title; 1: adult title
    #     startYear (YYYY) – represents the release year of a title. In the case of TV Series, it is the series start year
    #     endYear (YYYY) – TV Series end year.
    #     runtimeMinutes – primary runtime of the title, in minutes
    #     genres (string array) – includes up to three genres associated with the title
    title_basic = pd.read_csv('./data/raw_data/title_basic.tsv', sep="\t")

    # title.ratings.tsv.gz – Contains the IMDb rating and votes information for titles
    #     tconst (string) - alphanumeric unique identifier of the title
    #     averageRating – weighted average of all the individual user ratings
    #     numVotes - number of votes the title has received
    title_ratings = pd.read_csv('./data/raw_data/title_ratings.tsv', sep="\t")

    # title.crew.tsv.gz – Contains the director and writer information for all the titles in IMDb. Fields include:
    #     tconst (string) - alphanumeric unique identifier of the title
    #     directors (array of nconsts) - director(s) of the given title
    #     writers (array of nconsts) – writer(s) of the given title
    title_crew = pd.read_csv('./data/raw_data/directors_writers.tsv', sep="\t")

    ## Data Processing
    # Keep only feature films *before* joining: both joins below are inner
    # joins on tconst, so this selects the same rows as filtering afterwards
    # while keeping the intermediate tables far smaller.
    movies_only = title_basic[title_basic['titleType'] == "movie"]

    # Join ratings to movie
    movie_features = pd.merge(movies_only, title_ratings, on="tconst")

    # Join crew to movie
    movie_features = movie_features.merge(title_crew, on="tconst")

    ## Saving IMDB dataset (the index is written too, hence the
    ## 'Unnamed: 0' drop in the loading branch)
    movie_features.to_excel("./data/processed_data/imdb_data.xlsx")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [48]:
## Combine both datasets and split to train and test
# Reload the cached split when both files exist; otherwise join the MovieLens
# ratings (`data`) with the IMDb features (`movie_features`) and split 80/20.
# NOTE: `final_data` is only defined when the split is rebuilt, not when the
# cached files are loaded.

try:
    train = pd.read_excel('./data/processed_data/train_data.xlsx')
    test = pd.read_excel('./data/processed_data/test_data.xlsx')
    # The files are saved together with their index, which is read back as
    # the 'Unnamed: 0' column.
    train = train.drop(columns=['Unnamed: 0'], errors='ignore')
    test = test.drop(columns=['Unnamed: 0'], errors='ignore')

except FileNotFoundError:
    # Only a missing cache file triggers the rebuild; any other error
    # (including a keyboard interrupt) is allowed to surface.
    # NOTE(review): the join matches on title text alone, so different films
    # sharing a title (e.g. remakes) are all matched to the same ratings —
    # consider also matching on release year.
    final_data = pd.merge(data, movie_features, left_on=["title"], right_on=["originalTitle"])

    # Fixed random_state keeps the 80/20 split reproducible across runs.
    train, test = train_test_split(final_data, test_size=0.2, random_state=10, shuffle=True)

    ##Saving Dataset
    # NOTE(review): an Excel sheet holds at most 1,048,576 rows and to_excel
    # raises when a frame is larger — confirm the merged data fits, or switch
    # these checkpoints to CSV/parquet.
    train.to_excel('./data/processed_data/train_data.xlsx')
    test.to_excel('./data/processed_data/test_data.xlsx')
    final_data.to_excel('./data/processed_data/full_data.xlsx')