In [1]:
import time
import gc
import lightgbm as lgb
from sklearn.decomposition import TruncatedSVD
import joblib
import random
import os
import glob
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

pd.set_option('future.no_silent_downcasting', True)
verbose = True

In [2]:
yelp_business_df_path = 'files/busCleaned.csv'
yelp_user_df_path = 'files/user_cleaned.csv'
review_chunks_path = f"files/review_chunks/no_label/rating_group"
chunk_pattern = 'review_chunk_star'

final_vectorizer_path = 'files/rating_vizer.pkl'
final_svd_path = 'files/rating_svd.pkl'

final_train_df_path = 'files/final_train_df.csv'

xtrain_path = 'models/X_train.npy'
xtest_path = 'models/X_test.npy'
ytrain_path = 'models/y_train.npy'
ytest_path = 'models/y_test.npy'
cat_ind_path = 'models/cat_ind.npy' # categorical indices for LightBGM
train_ind_path = 'models/train_ind.npy' # the chunks we trained on, don't want to test on these

# Try stuff so I don't just keep getting errors if something goes wrong
try:
    print('Trying to load leaned business and cleaned user files')
    business_df = pd.read_csv(yelp_business_df_path)
    user_df = pd.read_csv(yelp_user_df_path)
    print('Files loaded successfully')
except FileNotFoundError:
    print('One or more files not found. Cannot continue.')


Trying to load leaned business and cleaned user files
Files loaded successfully


In [3]:
def load_and_concat_chunks(file_list):
    # Loads multiple CSV chunks into a single DataFrame.
    list_of_dfs = []
    if not file_list: # Handle case where selection failed
        return pd.DataFrame()
    for filename in file_list:
        try:
            df_chunk = pd.read_csv(filename)
            list_of_dfs.append(df_chunk)
        except Exception as e:
            print(f"Error loading {filename}: {e}")
    if not list_of_dfs: # return empty data frame if file_list is empty
        return pd.DataFrame()
    return pd.concat(list_of_dfs, axis=0, ignore_index=True)

num_chunks = 34
num_chunks_select = 10

random.seed(42)
# pick 8 random numbers to be the 4 chunks we train on and the four chunks we evaluate on.
selected_indices = random.sample(range(num_chunks), num_chunks_select)
train_chunk_indices = selected_indices[:5]

test_chunk_indices = selected_indices[5:]

full_pattern = os.path.join(review_chunks_path, chunk_pattern)
all_chunk_files = sorted(glob.glob(full_pattern))

train_files_to_load = [all_chunk_files[i] for i in train_chunk_indices]
test_files_to_load = [all_chunk_files[i] for i in test_chunk_indices]

train_df = load_and_concat_chunks(train_files_to_load)
test_df = load_and_concat_chunks(test_files_to_load)

# also drop postal codes, they got letters in there
business_df = business_df.drop(['postal_code'], axis=1)
business_df = business_df.fillna(-1) # fill in empty values with -1, should learn that means missing/unknown

In [5]:
# merge dataframes
final_train_df = train_df.merge(business_df, on = 'business_id', how = 'left')
final_train_df = final_train_df.merge(user_df, on = 'user_id', how = 'left')
final_train_df.to_csv(final_train_df_path, index = False)
final_df = final_train_df.copy()


final_test_df = test_df.merge(business_df, on = 'business_id', how = 'left')
final_test_df = final_test_df.merge(user_df, on = 'user_id', how = 'left')
final_test_df.to_csv(final_test_df_path, index = False)



# attributes most likely for a business and user to have
# for checking floating reviews
check_cols = [
    'ByAppointmentOnly', 
    'RestaurantsPriceRange2',
    'BusinessAcceptsCreditCards',
    'RestaurantsPriceRange2',
    'account_age_years'
]
final_df = final_df.dropna(subset = check_cols)
final_df = final_df.dropna(subset=['text'])

# clear out the original dataframes from memory
del business_df, user_df
gc.collect()

490

In [6]:
t0 = time.time()
print('Starting vectorization')

# vectorize review text with tf-idf
vizer = TfidfVectorizer(max_features = 100000)
x_text = vizer.fit_transform(final_df['text'])

# reducing vectorized text dimensions, it gives me over 300,000 features
n_components = 300 
svd = TruncatedSVD(n_components=n_components, random_state=42)
x_text_reduced = svd.fit_transform(x_text)

# reduce the text size
text_reduced = x_text_reduced.astype(np.float32)
print(f'text vectorized after {time.time() - t0} seconds')

# save the fitted vectorizers
joblib.dump(vizer, final_vectorizer_path)
joblib.dump(svd, final_svd_path)
# dump them, I need all the memory I can get
del vizer, svd, x_text
gc.collect()

Starting vectorization
text vectorized after 154.39377331733704 seconds


0

In [8]:
# splits the final data frame into the parts we're going to use
drop_cols = ['user_id', 'business_id', 'text']

final_df = final_df.drop(drop_cols, axis=1)
# convert broad_category to numeric
label_encoder = LabelEncoder()

final_df = final_df.convert_dtypes()

y = final_df['stars']
final_df = final_df.drop(['stars'], axis=1)

categorical_features = ['ByAppointmentOnly', 'BusinessAcceptsCreditCards', 'BikeParking',
       'RestaurantsPriceRange2', 'RestaurantsTakeOut', 'RestaurantsDelivery',
       'Caters', 'WiFi', 'WheelchairAccessible', 'HappyHour', 'OutdoorSeating',
       'HasTV', 'RestaurantsReservations', 'DogsAllowed', 'Alcohol',
       'GoodForKids', 'RestaurantsAttire', 'RestaurantsTableService',
       'RestaurantsGoodForGroups', 'NoiseLevel', 'BusinessAcceptsBitcoin',
       'BusinessParking_garage', 'BusinessParking_street',
       'BusinessParking_validated', 'BusinessParking_lot',
       'BusinessParking_valet', 'Ambience_romantic', 'Ambience_intimate',
       'Ambience_touristy', 'Ambience_hipster', 'Ambience_divey',
       'Ambience_classy', 'Ambience_trendy', 'Ambience_upscale',
       'Ambience_casual', 'GoodForMeal_dessert', 'GoodForMeal_latenight',
       'GoodForMeal_lunch', 'GoodForMeal_dinner', 'GoodForMeal_brunch',
       'GoodForMeal_breakfast', 'broadCategory']

# replace -1 with 'unknown' so that encoder works
string_cols = final_df.select_dtypes(include='object').columns
final_df[string_cols] = final_df[string_cols].astype(object).replace(-1, 'Unknown')
# encode the categorical featurs
final_df[categorical_features] = final_df[categorical_features].apply(label_encoder.fit_transform)
categorical_features_indices = [final_df.columns.get_loc(name) for name in categorical_features]

if verbose:
    display(final_df.head(10))

# save categorical_features for training the model
np.save(cat_ind_path, categorical_features_indices)
# can't have any null values
if verbose:
    print(final_df.isnull().any())

# force everything to be float32 to save on memory
final_values = final_df.values.astype(np.float32)

print(f"\nMemory usage of X: {final_values.nbytes / (1024**2):.2f} MB\n")
print(f'final_values data types: {final_values.dtype}')

Unnamed: 0,ByAppointmentOnly,BusinessAcceptsCreditCards,BikeParking,RestaurantsPriceRange2,RestaurantsTakeOut,RestaurantsDelivery,Caters,WiFi,WheelchairAccessible,HappyHour,...,GoodForMeal_dessert,GoodForMeal_latenight,GoodForMeal_lunch,GoodForMeal_dinner,GoodForMeal_brunch,GoodForMeal_breakfast,review_count_x,broadCategory,review_count_y,account_age_years
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,79,7,1,10.852841
1,0,2,2,3,2,2,2,0,2,1,...,0,0,0,0,0,0,159,14,164,17.226557
2,1,2,2,2,1,2,0,1,2,2,...,0,0,0,0,0,0,14,15,60,8.692676
3,0,2,0,2,2,1,0,0,0,2,...,0,0,0,0,0,0,26,14,174,14.872005
4,0,2,2,3,0,0,0,0,0,0,...,0,0,0,0,0,0,5,15,13,11.482546
5,2,2,0,2,0,0,0,1,0,0,...,0,0,0,0,0,0,149,16,2,4.670773
6,0,2,2,2,2,2,2,1,2,2,...,1,1,2,2,1,1,113,14,11,11.627652
7,0,2,2,1,2,2,2,2,2,2,...,0,1,2,2,1,1,142,14,156,13.36345
8,0,2,1,2,2,2,2,1,2,1,...,0,0,2,2,1,1,92,14,6,8.052019
10,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,63,16,2,12.807666


ByAppointmentOnly             False
BusinessAcceptsCreditCards    False
BikeParking                   False
RestaurantsPriceRange2        False
RestaurantsTakeOut            False
RestaurantsDelivery           False
Caters                        False
WiFi                          False
WheelchairAccessible          False
HappyHour                     False
OutdoorSeating                False
HasTV                         False
RestaurantsReservations       False
DogsAllowed                   False
Alcohol                       False
GoodForKids                   False
RestaurantsAttire             False
RestaurantsTableService       False
RestaurantsGoodForGroups      False
NoiseLevel                    False
BusinessAcceptsBitcoin        False
BusinessParking_garage        False
BusinessParking_street        False
BusinessParking_validated     False
BusinessParking_lot           False
BusinessParking_valet         False
Ambience_romantic             False
Ambience_intimate           

In [9]:
print(y)

0         5
1         5
2         5
3         4
4         5
         ..
499995    5
499996    2
499997    5
499998    5
499999    5
Name: stars, Length: 484831, dtype: Int64


In [10]:
del final_df, test_df, final_test_df, train_df, final_train_df
gc.collect()


# Crash point warning!!!
X = np.hstack((final_values,x_text_reduced))

print(X.dtype)

# dump what we can
del x_text_reduced, final_values
gc.collect()

float64


0

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
del X
gc.collect()

print(X_train.dtype)
np.save(xtrain_path, X_train)
np.save(xtest_path, X_test)
np.save(ytrain_path, y_train)
np.save(ytest_path, y_test)

print('Np arrays saved. Ready for training.')


float64
Np arrays saved. Ready for training.


In [None]:
print(y_train)