In [1]:
import time
import gc
import lightgbm as lgb
from sklearn.decomposition import TruncatedSVD
import joblib
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Opt in to the pandas "future" behaviour in which operations such as
# fillna/replace no longer silently downcast object-dtype results.
pd.set_option('future.no_silent_downcasting', True)
# When True, the feature-preparation cell display()s a preview of the frame and
# prints its per-column null check.
verbose = True

In [2]:
# --- Input CSVs -------------------------------------------------------------
yelp_pseudo_df_path = 'files/reviews_pseudo_labeled.csv'
yelp_business_df_path = 'files/busCleaned.csv'
yelp_user_df_path = 'files/user_cleaned.csv'

# --- Output arrays consumed by the training step ------------------------------
xtrain_path = 'models/X_train.npy'
xtest_path = 'models/X_test.npy'
ytrain_path = 'models/y_train.npy'
ytest_path = 'models/y_test.npy'
cat_ind_path = 'models/cat_ind.npy'

# --- Fitted text transformers, persisted for reuse ----------------------------
final_vectorizer_path = 'files/final_vectorizer.pkl'
final_svd_path = 'files/final_svd.pkl'

# Load the three input frames. A missing file is reported with a friendly
# message and then re-raised: every later cell needs all three frames, and
# swallowing the error would either fail later with a confusing NameError or,
# worse, silently reuse stale frames left over in the kernel from an earlier run.
try:
    print('Trying to load pseudo labeled, cleaned business and cleaned user files')
    review_df = pd.read_csv(yelp_pseudo_df_path)
    business_df = pd.read_csv(yelp_business_df_path)
    user_df = pd.read_csv(yelp_user_df_path)
    print('Files loaded successfully')
except FileNotFoundError:
    print('One or more files not found. Cannot continue.')
    raise  # the traceback names the missing path


Trying to load pseudo labeled, cleaned business and cleaned user files
Files loaded successfully


In [3]:
# Prepare the raw frames for merging.
# Reviews: discard every row that contains a NaN in any column.
review_df = review_df.dropna()
# Businesses: postal codes are removed (they contain letters), and remaining
# gaps are filled with -1 so the model can treat -1 as "missing/unknown".
business_df = business_df.drop(columns=['postal_code']).fillna(-1)

In [4]:
# Merge dataframes: attach business attributes, then user attributes, to each
# review. Left joins keep every review even when its business/user is missing.
final_df = review_df.merge(business_df, on = 'business_id', how = 'left')
final_df = final_df.merge(user_df, on = 'user_id', how = 'left')

# Attributes most likely for a business and user to have; used to detect
# "floating" reviews. business_df had its own NaNs filled with -1 before the
# merge, so a NaN in a business column here means the left join found no match.
# (The original list named 'RestaurantsPriceRange2' twice; the duplicate had no
# effect on the result and has been removed.)
check_cols = [
    'ByAppointmentOnly',
    'RestaurantsPriceRange2',
    'BusinessAcceptsCreditCards',
    'account_age_years'
]
final_df = final_df.dropna(subset = check_cols)

# clear out the original dataframes from memory
del business_df, review_df, user_df
gc.collect();  # trailing ';' keeps the collected-object count out of the cell output

20

In [5]:
t0 = time.time()
print('Starting vectorization')

# vectorize review text with tf-idf (vocabulary capped at max_features terms)
vizer = TfidfVectorizer(max_features = 100000)
x_text = vizer.fit_transform(final_df['text'])

# reduce the vectorized text to a small dense representation
# (original note: tf-idf gave over 300,000 features before reduction)
n_components = 300
svd = TruncatedSVD(n_components=n_components, random_state=42)
x_text_reduced = svd.fit_transform(x_text)

# Downcast to float32 under the SAME name. The later stacking cell reads
# `x_text_reduced`; previously the downcast result was bound to a different
# name (`text_reduced`), so the float64 array was the one actually used and
# the float32 copy just sat in memory, never deleted.
x_text_reduced = x_text_reduced.astype(np.float32)
print(f'text vectorized after {time.time() - t0} seconds')

# save the fitted vectorizers
joblib.dump(vizer, final_vectorizer_path)
joblib.dump(svd, final_svd_path)
# dump them, I need all the memory I can get
del vizer, svd, x_text
gc.collect();  # trailing ';' suppresses the return value in the cell output

Starting vectorization


KeyboardInterrupt: 

In [None]:
# Splits the final data frame into the parts we're going to use:
#   y            - the 'pseudo_label' column
#   final_values - every remaining column as one float32 matrix
drop_cols = ['user_id', 'business_id', 'text']

final_df = final_df.drop(drop_cols, axis=1)

final_df = final_df.convert_dtypes()

y = final_df['pseudo_label']
final_df = final_df.drop(['pseudo_label'], axis=1)

categorical_features = ['ByAppointmentOnly', 'BusinessAcceptsCreditCards', 'BikeParking',
       'RestaurantsPriceRange2', 'RestaurantsTakeOut', 'RestaurantsDelivery',
       'Caters', 'WiFi', 'WheelchairAccessible', 'HappyHour', 'OutdoorSeating',
       'HasTV', 'RestaurantsReservations', 'DogsAllowed', 'Alcohol',
       'GoodForKids', 'RestaurantsAttire', 'RestaurantsTableService',
       'RestaurantsGoodForGroups', 'NoiseLevel', 'BusinessAcceptsBitcoin',
       'BusinessParking_garage', 'BusinessParking_street',
       'BusinessParking_validated', 'BusinessParking_lot',
       'BusinessParking_valet', 'Ambience_romantic', 'Ambience_intimate',
       'Ambience_touristy', 'Ambience_hipster', 'Ambience_divey',
       'Ambience_classy', 'Ambience_trendy', 'Ambience_upscale',
       'Ambience_casual', 'GoodForMeal_dessert', 'GoodForMeal_latenight',
       'GoodForMeal_lunch', 'GoodForMeal_dinner', 'GoodForMeal_brunch',
       'GoodForMeal_breakfast', 'broadCategory']

# replace -1 with 'Unknown' in the object-dtype columns so that the encoder works
string_cols = final_df.select_dtypes(include='object').columns
final_df[string_cols] = final_df[string_cols].astype(object).replace(-1, 'Unknown')

# Encode the categorical features with ONE LabelEncoder PER COLUMN and keep
# them. Previously a single encoder was refit on each column in turn, so every
# category<->code mapping except the last column's was lost, and new data could
# not be encoded consistently with the training data.
label_encoders_path = 'files/final_label_encoders.pkl'
label_encoders = {}


def _fit_label_encoder(column):
    """Fit a fresh LabelEncoder on `column`, remember it by column name, return the codes."""
    encoder = LabelEncoder()
    label_encoders[column.name] = encoder
    return encoder.fit_transform(column)


final_df[categorical_features] = final_df[categorical_features].apply(_fit_label_encoder)
# Keep the old `label_encoder` name bound to what it used to end up as: the
# encoder fitted on the last categorical column.
label_encoder = label_encoders[categorical_features[-1]]
# Persist the mappings next to the fitted vectorizer/SVD.
joblib.dump(label_encoders, label_encoders_path)
categorical_features_indices = [final_df.columns.get_loc(name) for name in categorical_features]

if verbose:
    display(final_df.head(10))

# save categorical_features for training the model
np.save(cat_ind_path, categorical_features_indices)
# can't have any null values
if verbose:
    print(final_df.isnull().any())

# force everything to be float32 to save on memory
final_values = final_df.values.astype(np.float32)

del final_df
gc.collect()

print(f"\nMemory usage of X: {final_values.nbytes / (1024**2):.2f} MB\n")
print(f'final_values data types: {final_values.dtype}')

In [None]:
# Crash point warning!!!
# Build the combined feature matrix directly as float32. np.hstack promotes to
# the widest input dtype, so a float64 text block would silently turn the whole
# matrix into float64 and double its size. Filling a preallocated float32
# array casts on assignment instead.
n_rows, n_tabular = final_values.shape
if x_text_reduced.shape[0] != n_rows:
    # Same exception type np.hstack raises for mismatched row counts; checked
    # explicitly because slice assignment would broadcast a single-row input.
    raise ValueError(
        f'row count mismatch: final_values has {n_rows} rows, '
        f'x_text_reduced has {x_text_reduced.shape[0]}'
    )
X = np.empty((n_rows, n_tabular + x_text_reduced.shape[1]), dtype=np.float32)
X[:, :n_tabular] = final_values      # tabular features come first
X[:, n_tabular:] = x_text_reduced    # SVD text components follow

print(X.dtype)

# dump what we can
del x_text_reduced, final_values
gc.collect();  # trailing ';' suppresses the return value in the cell output

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# The full matrix is no longer needed once the splits exist.
del X
gc.collect()

print(X_train.dtype)
# Write each split to its configured .npy path, in the same order as before.
for split_path, split_values in (
    (xtrain_path, X_train),
    (xtest_path, X_test),
    (ytrain_path, y_train),
    (ytest_path, y_test),
):
    np.save(split_path, split_values)

print('Np arrays saved. Ready for training.')
