# Train/Validation/Test Split

- Kevin Chuang
- Yelp Academic Dataset (01.2019)

In [1]:
import os
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from numpy.linalg import norm
from sklearn.pipeline import FeatureUnion
from scipy.sparse import coo_matrix
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn import base
from ast import literal_eval

%matplotlib inline

In [2]:
# Read the Yelp restaurant review table from disk into a DataFrame.
RESTAURANT_DATA_CSV = "final_yelp_restaurant_data_v3.csv"
rest_df = pd.read_csv(RESTAURANT_DATA_CSV)

In [3]:
# Sanity check: (number of rows, number of columns) of the loaded table.
rest_df.shape

(1684896, 12)

In [4]:
# Preview the first rows to confirm the columns were parsed as expected.
rest_df.head()

Unnamed: 0,business_id,date,review_id,stars,text,user_id,attributes,name,city,categories,avg_business_stars,avg_user_star
0,u8C8pRvaHXg3PgDrsUHJHQ,2004-10-19 19:24:13,0QHCY_55TFHHvyumEMpDew,4.0,Good stuff. Pricey by normal pizza standards.,nkN_do3fJ9xekchVC-v68A,"{'GoodForKids': 'True', 'BikeParking': 'True',...",Papa Del's Pizza,Champaign,"Food Delivery Services, Salad, Restaurants, Fo...",3.5,3.7
1,EZOoB2D8uQHV_gJoGCMTxQ,2004-10-19 21:33:08,1Iobyi_7BkFON25Oegs0aw,4.0,Love their subs. Cheap and top shelf ingredients.,nkN_do3fJ9xekchVC-v68A,"{'GoodForKids': 'True', 'BikeParking': 'True',...",Jimmy John's,Champaign,"Restaurants, American (Traditional), Food, Del...",3.0,3.7
2,oYMsq2Xvzw6UbrIlMWjb-A,2004-10-19 21:34:40,2F5J51OYtD49eyIUKJKVgg,4.0,Love their pizza. They used to have a great ta...,nkN_do3fJ9xekchVC-v68A,"{'HasTV': 'False', 'Alcohol': ""u'none'"", 'Rest...",Papa John's Pizza,Champaign,"Restaurants, Pizza",2.5,3.7
3,-FLnsWAa4AGEW4NgE8Fqew,2009-03-15 20:38:02,lwln20Y19e78B2Pn-nxCVw,3.0,An okay breakfast spot.\n\nThe good news? If y...,nkN_do3fJ9xekchVC-v68A,"{'RestaurantsGoodForGroups': 'True', 'Corkage'...",Breakfast Club- Scottsdale,Scottsdale,"Coffee & Tea, Turkish, American (Traditional),...",4.0,3.7
4,rtlsfmdufArhk-47sWIf2w,2013-04-27 00:49:49,q4PwVSBgR-DAJyxWO4hHZA,4.0,A surprisingly intimate restaurant in the Bell...,nkN_do3fJ9xekchVC-v68A,"{'RestaurantsTakeOut': 'False', 'OutdoorSeatin...",Le Cirque,Las Vegas,"French, Restaurants",4.5,3.7


## Train/Val/Test Split

- We stratify the split on user ID so that users are represented in each of the datasets; this matters because this is a recommender system problem.
- The training set is used to fit the model, the validation set to tune hyperparameters, and the test set for the final evaluation. With two successive 80/20 splits, the resulting proportions are roughly 64% train, 16% validation, and 20% test.

In [5]:
# First split: hold out 20% of all reviews as the test set. The features are
# every column except `stars`; the labels are the `stars` values. The split is
# stratified on user_id and seeded so it can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    rest_df.drop(columns=["stars"]),
    rest_df["stars"].values,
    test_size=0.20,
    random_state=8,
    stratify=rest_df["user_id"].values,
)

In [6]:
# Second split: carve the validation set out of the remaining training data
# (20% of it), again stratified on user_id and using the same seed.
# NOTE: this cell rebinds X_train / y_train from their own previous values, so
# running it a second time without first re-running the test split above
# shrinks the training set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.20,
    random_state=8,
    stratify=X_train["user_id"],
)

In [7]:
# Every feature frame must have exactly one label per row.
for split_frame, split_labels in (
    (X_train, y_train),
    (X_test, y_test),
    (X_val, y_val),
):
    assert len(split_frame) == len(split_labels)

In [8]:
# Report how many reviews ended up in each partition.
for message, split_frame in (
    ("Number of training instances: {}", X_train),
    ("Number of validation instances: {}", X_val),
    ("Number of test instances: {}", X_test),
):
    print(message.format(len(split_frame)))

Number of training instances: 1078332
Number of validation instances: 269584
Number of test instances: 336980


In [9]:
# Peek at the first five training labels.
y_train[:5]

array([1., 4., 4., 4., 5.])

In [10]:
# Re-attach the ratings as a `stars` column so features and labels travel
# together in one table. `assign` returns a new frame instead of writing into
# the object handed back by train_test_split, which avoids pandas'
# SettingWithCopyWarning on frames derived from another frame.
X_train = X_train.assign(stars=y_train)

In [11]:
# The training table must not contain any missing values.
assert X_train.notnull().values.all()

In [12]:
# Persist the training split; the row index is not written to the file.
X_train.to_csv(path_or_buf="yelp_restaurant_train.csv", index=False)

In [13]:
# Re-attach the ratings as a `stars` column so features and labels travel
# together in one table. `assign` returns a new frame instead of writing into
# the object handed back by train_test_split, which avoids pandas'
# SettingWithCopyWarning on frames derived from another frame.
X_val = X_val.assign(stars=y_val)

In [14]:
# The validation table must not contain any missing values.
assert X_val.notnull().values.all()

In [15]:
# Persist the validation split; the row index is not written to the file.
X_val.to_csv(path_or_buf="yelp_restaurant_validation.csv", index=False)

In [16]:
# Re-attach the ratings as a `stars` column so features and labels travel
# together in one table. `assign` returns a new frame instead of writing into
# the object handed back by train_test_split, which avoids pandas'
# SettingWithCopyWarning on frames derived from another frame.
X_test = X_test.assign(stars=y_test)

In [17]:
# The test table must not contain any missing values.
assert X_test.notnull().values.all()

In [18]:
# Persist the test split; the row index is not written to the file.
X_test.to_csv(path_or_buf="yelp_restaurant_test.csv", index=False)