In [1]:
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile


def download_and_unzip(url, extract_to='.'):
    """Download a zip archive from ``url`` and extract it into ``extract_to``.

    The whole archive is read into memory before extraction, so this is
    intended for reasonably small archives.

    Args:
        url: Location of the zip archive; anything ``urlopen`` accepts.
        extract_to: Directory the archive members are extracted into.
            Defaults to the current working directory.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        zipfile.BadZipFile: If the downloaded payload is not a valid zip file.
    """
    # Release the connection as soon as the payload is in memory instead of
    # leaving the response open until garbage collection.
    with urlopen(url) as http_response:
        payload = http_response.read()

    # ``archive`` (rather than ``zipfile``) avoids shadowing the stdlib module
    # name; the context manager guarantees the archive handle is closed.
    with ZipFile(BytesIO(payload)) as archive:
        archive.extractall(path=extract_to)


# Fetch the MovieLens 100K archive and unpack it into the working directory;
# later cells read their inputs from ./ml-100k/.
download_and_unzip(
    "https://files.grouplens.org/datasets/movielens/ml-100k.zip",
    extract_to='.',
)

Next, let's import all of the modules that we'll use in this notebook.

In [3]:
# Standard library imports
import random
import time

# Third-party imports
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from tqdm.notebook import tqdm
from sklearn import preprocessing as pp
from sklearn.model_selection import train_test_split
import scipy.sparse as sp

In [4]:
# Use the first CUDA device when PyTorch can see one, otherwise run on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda', index=0)

In [18]:
def make_dfs(data_path="./ml-100k/u.data", min_rating=3, test_size=0.1, random_state=16):
  """Load the ratings file, drop low ratings, and split it into train/test frames.

  Args:
    data_path: Tab-separated ratings file with the columns
      user_id, item_id, rating, timestamp (no header row).
    min_rating: Only rows with ``rating >= min_rating`` are kept.
    test_size: Forwarded to ``train_test_split``.
    random_state: Seed forwarded to ``train_test_split`` so the split is
      reproducible.

  Returns:
    ``(train_df, test_df)``. ``test_df`` only contains rows whose user_id and
    item_id both appear in ``train_df``; its index is not reset after that
    filtering.
  """
  columns_name = ['user_id', 'item_id', 'rating', 'timestamp']
  df = pd.read_csv(data_path, sep="\t", names=columns_name)

  df = df[df['rating'] >= min_rating]
  print("Df lenght:", len(df))

  # The split is done on the raw value array and the frames are rebuilt, so
  # both outputs start from a fresh 0..n-1 index.
  train, test = train_test_split(df.values, test_size=test_size, random_state=random_state)
  train_df = pd.DataFrame(train, columns=df.columns)
  test_df = pd.DataFrame(test, columns=df.columns)

  # Drop test interactions that involve a user or an item never seen in training.
  known_users = train_df['user_id'].unique()
  known_items = train_df['item_id'].unique()
  seen_in_train = test_df['user_id'].isin(known_users) & test_df['item_id'].isin(known_items)
  test_df = test_df[seen_in_train]

  return train_df, test_df

In [19]:
def _load_item_features(data_dir):
  """Read u.item and return item_id plus the named genre indicator columns."""
  film_columns = ["item_id", "movie title", "release date", "video release date",
                  "IMDb URL", "unknown", "Action", "Adventure", "Animation",
                  "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
                  "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi",
                  "Thriller", "War", "Western"]
  films_df = pd.read_csv(f"{data_dir}/u.item", sep="|", names=film_columns, encoding='latin-1')
  return films_df.drop(
    columns=["movie title", "release date", "IMDb URL", "unknown", "video release date"]
  )


def _load_user_features(data_dir):
  """Read u.user and return user_id, age, and label-encoded sex and occupation."""
  user_columns = ["user_id", "age", "sex", "occupation", "zip_code"]
  user_df = pd.read_csv(f"{data_dir}/u.user", sep="|", names=user_columns, encoding='latin-1')
  user_df["sex"] = pp.LabelEncoder().fit_transform(user_df["sex"])

  # The occupation encoder is fitted on the vocabulary in u.occupation rather
  # than on the values that happen to occur in u.user.
  occup_df = pd.read_csv(f"{data_dir}/u.occupation", sep="\t", names=["jobs"])
  le = pp.LabelEncoder()
  le.fit(occup_df["jobs"])
  user_df["occupation"] = le.transform(user_df["occupation"])

  return user_df.drop(columns=["zip_code"])


def _to_features_and_target(ratings_df, films_df, user_df):
  """Join one ratings frame with item and user features and split it into (x, y)."""
  merged = pd.merge(ratings_df, films_df, how='left', on='item_id')
  merged = pd.merge(merged, user_df, how='left', on='user_id')
  # Identifiers and the timestamp are join keys / metadata, not model inputs.
  merged = merged.drop(columns=["item_id", "user_id", "timestamp"])
  return merged.drop('rating', axis=1).values, merged["rating"].values


def preproc(train_df, test_df, data_dir="./ml-100k"):
  """Turn train/test rating frames into feature matrices and rating targets.

  Each rating row is left-joined with its item's genre indicators and its
  user's age, encoded sex and encoded occupation. Train and test go through
  the same helper, so their column layouts always match. The input frames are
  not modified.

  Args:
    train_df: Ratings with columns user_id, item_id, rating, timestamp.
    test_df: Ratings with the same columns as ``train_df``.
    data_dir: Directory containing u.item, u.user and u.occupation.

  Returns:
    ``(train_x, train_y, test_x, test_y)`` as NumPy arrays. Feature columns
    are the genre indicators (in u.item order, without "unknown") followed by
    age, sex, occupation; the targets are the ``rating`` values.
  """
  films_df = _load_item_features(data_dir)
  user_df = _load_user_features(data_dir)

  train_x, train_y = _to_features_and_target(train_df, films_df, user_df)
  test_x, test_y = _to_features_and_target(test_df, films_df, user_df)

  return train_x, train_y, test_x, test_y


In [20]:
# Build the filtered train/test rating frames, then expand them into feature
# matrices (train_x / test_x) and rating targets (train_y / test_y).
train_df, test_df = make_dfs()
train_x, train_y, test_x, test_y = preproc(train_df, test_df)

Df lenght: 82520


In [21]:
# Peek at one encoded training row: genre indicator columns first, then the
# user's age, encoded sex and encoded occupation (the column order produced by preproc).
train_x[1]

array([ 0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  1,  0,  0,  0,
        0, 24,  1,  4])

In [71]:
# %pip (not !pip) installs into the environment of the running kernel.
# The TPOTClassifier call in this notebook uses the classic 0.x keyword
# arguments (e.g. verbosity=), so stay below the 1.0 API rewrite.
%pip install "tpot<1.0"



In [22]:
from tpot import TPOTClassifier

# Small, seeded TPOT pipeline search over the rating classes: 5 generations
# with a population of 10, progress printed at verbosity level 2.
tpot = TPOTClassifier(
    generations=5,
    population_size=10,
    verbosity=2,
    random_state=42,
    early_stop=3,
)
tpot.fit(train_x, train_y)

Optimization Progress:   0%|          | 0/60 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.4444983204235945

Generation 2 - Current best internal CV score: 0.4444983204235945

Generation 3 - Current best internal CV score: 0.4507728569748809

Generation 4 - Current best internal CV score: 0.4511498443917394

Generation 5 - Current best internal CV score: 0.4511498443917394

Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=gini, max_features=0.55, min_samples_leaf=19, min_samples_split=17, n_estimators=100)


In [23]:
# Evaluate the best pipeline found by the search on the held-out test split.
# NOTE(review): no scoring argument was passed to TPOTClassifier, so this is
# presumably TPOT's default classification metric (accuracy) — confirm.
tpot.score(test_x, test_y)

0.4451456310679612