In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

In [38]:
# Creating the Dataset takes incredibly long to run so we created intermediate checkpoints (saving the semi-processed data) to avoid the long process of creating the dataset again

try:
    data  = pd.read_excel('./data/processed_data/movie_lens_data.xlsx')
    data = data.drop(columns = ['Unnamed: 0'])

except:
    ### Datasets as described @ https://files.grouplens.org/datasets/movielens/ml-1m-README.txt
    print("Creating Dataset")
    data = pd.DataFrame(columns= ["UserID", "MovieID", "Rating", "Timestamp"])
    """
    RATINGS FILE DESCRIPTION
    ================================================================================

    All ratings are contained in the file "ratings.dat" and are in the
    following format:

    UserID::MovieID::Rating::Timestamp

    - UserIDs range between 1 and 6040 
    - MovieIDs range between 1 and 3952
    - Ratings are made on a 5-star scale (whole-star ratings only)
    - Timestamp is represented in seconds since the epoch as returned by time(2)
    - Each user has at least 20 ratings
    """
    
    ratings = [i.strip() for i in open("./data/raw_data/ratings.dat").readlines()]

    for rating in ratings:
        attr = rating.split('::')
        userID = attr[0]
        movieID = attr[1]
        score = attr[2]
        time = attr[3]

        data = data.append ({"UserID" : userID, "MovieID" : movieID, "Rating" : score, "Timestamp" : time}, ignore_index=True)

    ## Adding User Data
"""
    USERS FILE DESCRIPTION
    ================================================================================

    User information is in the file "users.dat" and is in the following
    format:

    UserID::Gender::Age::Occupation::Zip-code

    All demographic information is provided voluntarily by the users and is
    not checked for accuracy.  Only users who have provided some demographic
    information are included in this data set.

    - Gender is denoted by a "M" for male and "F" for female
    - Age is chosen from the following ranges:

        *  1:  "Under 18"
        * 18:  "18-24"
        * 25:  "25-34"
        * 35:  "35-44"
        * 45:  "45-49"
        * 50:  "50-55"
        * 56:  "56+"

    - Occupation is chosen from the following choices:

        *  0:  "other" or not specified
        *  1:  "academic/educator"
        *  2:  "artist"
        *  3:  "clerical/admin"
        *  4:  "college/grad student"
        *  5:  "customer service"
        *  6:  "doctor/health care"
        *  7:  "executive/managerial"
        *  8:  "farmer"
        *  9:  "homemaker"
        * 10:  "K-12 student"
        * 11:  "lawyer"
        * 12:  "programmer"
        * 13:  "retired"
        * 14:  "sales/marketing"
        * 15:  "scientist"
        * 16:  "self-employed"
        * 17:  "technician/engineer"
        * 18:  "tradesman/craftsman"
        * 19:  "unemployed"
        * 20:  "writer"
    """
    users = [i.strip() for i in open("./data/raw_data/users.dat").readlines()]
    user_data = pd.DataFrame(columns= ["UserID", "Gender", "Age", "Occupation","Zip-code"])

    for user in users:
        attr = user.split("::")
        user_id = attr[0]
        gender = attr[1]
        age = attr[2]
        occupation = attr[3]
        zip_code = attr[4]

        user_data = user_data.append({"UserID" : user_id, "Gender" : gender, "Age": age, "Occupation": occupation,                                                          "Zip-code":zip_code  } , ignore_index=True)

    data = data.join(user_data.set_index('UserID'), on='UserID')

    ## Adding Movie Data

    """
    MOVIES FILE DESCRIPTION
    ================================================================================

    Movie information is in the file "movies.dat" and is in the following
    format:

    MovieID::Title::Genres

    - Titles are identical to titles provided by the IMDB (including
    year of release)
    - Genres are pipe-separated and are selected from the following genres:

        * Action
        * Adventure
        * Animation
        * Children's
        * Comedy
        * Crime
        * Documentary
        * Drama
        * Fantasy
        * Film-Noir
        * Horror
        * Musical
        * Mystery
        * Romance
        * Sci-Fi
        * Thriller
        * War
        * Western
    """
    movies = [i.strip() for i in open("./data/raw_data/movies.dat").readlines()]
    movie_data = pd.DataFrame(columns= ["MovieID", "Title", "Genres"])

    for movie in movies:
        attr = movie.split("::")
        movie_id = attr[0]
        title = attr[1]
        genre = attr[2]

        movie_data = movie_data.append({"MovieID" : movie_id, "Title" : title, "Genres": genre} , ignore_index=True)


    data = data.join(movie_data.set_index('MovieID'), on='MovieID')
    
    #Clean the movie title
    data["MovieYear"] = data["Title"][0][-6:].strip("(").strip(")").strip(" ")
    data["Title"] = data["Title"].apply(lambda x: x[:-7])

    #Saving dataset into excel (Intermediate checkpoint)
    data.to_excel("./data/processed_data/movie_lens_data.xlsx")

In [10]:
data.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,Zip-code,Title,Genres,MovieYear
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest,Drama,1975
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach,Animation|Children's|Musical,1975
2,1,914,3,978301968,F,1,10,48067,My Fair Lady,Musical|Romance,1975
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich,Drama,1975
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A",Animation|Children's|Comedy,1975


In [11]:
try:
    movie_features = pd.read_excel('data/processed_data/imdb_data.xlsx')
    movie_features = movie_features.drop(columns = ['Unnamed: 0'])

except:
    ## Load IMDB Data

    """
    name.basics – Contains the following information for names:
        nconst (string) - alphanumeric unique identifier of the name/person
        primaryName (string)– name by which the person is most often credited
        birthYear – in YYYY format
        deathYear – in YYYY format if applicable, else '\N'
        primaryProfession (array of strings)– the top-3 professions of the person
        knownForTitles (array of tconsts) – titles the person is known for
    """
    name_basics = pd.read_csv('./data/raw_data/names.tsv', sep='\t')
    """
    title.basics.tsv.gz - Contains the following information for titles:
        tconst (string) - alphanumeric unique identifier of the title
        titleType (string) – the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc)
        primaryTitle (string) – the title used by the filmmakers on promotional materials at the point of release
        originalTitle (string) - original title, in the original language
        isAdult (boolean) - 0: non-adult title; 1: adult title
        startYear (YYYY) – represents the release year of a title. In the case of TV Series, it is the series start year
        endYear (YYYY) – TV Series end year. ‘\N’ for all other title types
        runtimeMinutes – primary runtime of the title, in minutes
        genres (string array) – includes up to three genres associated with the title
    """
    title_basic = pd.read_csv('./data/raw_data/title_basic.tsv', sep="\t")

    """
    title.ratings.tsv.gz – Contains the IMDb rating and votes information for titles
        tconst (string) - alphanumeric unique identifier of the title
        averageRating – weighted average of all the individual user ratings
        numVotes - number of votes the title has received
    """
    title_ratings = pd.read_csv('./data/raw_data/title_ratings.tsv', sep="\t")

    """
    title.crew.tsv.gz – Contains the director and writer information for all the titles in IMDb. Fields include:
        tconst (string) - alphanumeric unique identifier of the title
        directors (array of nconsts) - director(s) of the given title
        writers (array of nconsts) – writer(s) of the given title
    """
    title_crew = pd.read_csv('./data/raw_data/directors_writers.tsv', sep="\t")


    ## Data Processing
    #Join Ratings to movie
    movie_features = pd.merge(title_basic, title_ratings,on="tconst" )

    #Join Crew to movie
    movie_features = movie_features.merge(title_crew, on="tconst")

    #Filter out movies
    movie_features = movie_features[movie_features['titleType'] == "movie"]

    ##Saving IMDB dataset
    movie_features.to_excel("./data/processed_data/imdb_data.xlsx")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [41]:
## Combine both datasets and split to train and test

try:
    train = pd.read_excel('./data/processed_data/train_data.xlsx')
    test = pd.read_excel('./data/processed_data/test_data.xlsx')
    train = train.drop(columns = ['Unnamed: 0'])
    test = test.drop(columns = ['Unnamed: 0'])

except:
    final_data = pd.merge(data, movie_features, left_on=["Title"], right_on=["originalTitle"])
    train, test = train_test_split(final_data, test_size = 0.2, random_state = 10, shuffle= True)

    ##Saving Dataset
    train.to_excel('./data/processed_data/train_data.xlsx')
    test.to_excel('./data/processed_data/test_data.xlsx')