In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import string
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from tqdm import tqdm

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yellow_flash/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
business_file = "final_business_CA.gzip"
reviews_file = "final_review_CA.gzip"
users_file = "final_data_user_yelp.gzip"

In [4]:
### Do data preprocessing containing dictionary of user, businesses and star ratings
reviews_df = pd.read_pickle(reviews_file)

In [5]:
reviews_df.sample(frac=1)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
6145586,DeNuoyyXhCdMSmbvWQbSRg,kM4SXczHBRKA2tlijNzeJA,Ss6f5v_t5Vddiaz4ZOxL0g,5,3,1,0,If you are from a northern European background...,2018-05-28 23:48:50
5224609,2M8VQDW_5XtTaIqL-Z27jw,EnOSuYE66wwiedOi4iLEeg,v1Uesklh8DpEufYOhTq4iA,2,0,0,0,"Like one person said, it is like a motel...and...",2016-12-31 02:33:24
2143653,2u0W22FmbGJtHWJYWrCY8Q,jK3ew0Vb7aWoRy-oGNwPUA,KC8_Rx4Orlsz8LIonCYXsA,5,0,0,0,I just wanted to shout out the blonde bartende...,2015-11-18 22:10:35
4098272,5ivfR_FkeP1aU2SbyGwDRw,HPGE2PP4VWxbMeltFRefQA,tf6S06LD09IFDHUpOPqPaw,5,1,0,0,Best tacos hand down in San Diego (at least sh...,2021-06-28 23:48:54
182690,WYl3yhJg5YEj2_bBhhzVZA,R0dYtzZ0y2LDMoUTBPlFLg,gebiRewfieSdtt17PTW6Zg,2,0,0,0,I went to Something's Fishy on Saturday for an...,2011-05-04 18:30:30
...,...,...,...,...,...,...,...,...,...
4237825,U-9VYk-D-VeRjJ67Z8cDng,tcVQWUjoojuqVX1p6olXqw,y4x_wAQvUxvTR8iV-JP4Jg,5,0,0,0,Visited this place for the first time this pas...,2017-11-19 18:53:57
3454094,8GnhC-dzIYL7usiJOvLe-w,B5s_DCLVrBLrL8U6TEVlwA,L6nIOUwcTGgQKExGHmvTzQ,3,6,7,3,Downgrade; the crazy lady yelled at me several...,2015-06-09 22:52:42
4417564,tklzuteHAgxD_b4DzmVTaA,U-qCwBv5CZaHAk-OjdyUkQ,8Vo6LN9gqULhXzoxj5k6kQ,5,0,0,0,I visited this little gem of a restaurant on m...,2015-11-24 17:51:04
1325776,Wq_SdHLqWJ0URa5CYHIJ5g,jWUMOf1VyNehtcQkdkmTQQ,ghLjxj4HoSHdflBEz2lIqA,4,6,2,2,Need some food this coronavirus season? Buy a ...,2020-03-16 00:26:27


In [6]:
users_review_df = reviews_df.groupby('user_id')[['text', 'date']].agg(list).reset_index()

In [7]:
business_review_df = reviews_df.groupby('business_id')[['text', 'date']].agg(list).reset_index()

In [8]:
def text_preprocess(review):
    punctuation = set(string.punctuation)
    stemmer = PorterStemmer()
    stop_W = stopwords.words("english")
    r = ''.join([c for c in review.lower() if (not c in punctuation) and c.isalpha()])
    word_list = []
    for w in r.split():
        w = stemmer.stem(w)
        if w not in stop_W:
            word_list.append(w)
    return word_list
    

In [9]:
model = TfidfVectorizer(stop_words='english')

In [10]:
tf_idf_df = {'user_id': [], 'business_id': [], 'tf_idf_hours_matrix': []}

In [12]:
for idx, df in tqdm(reviews_df[:1].iterrows()):
    user_id = df['user_id']
    business_id = df['business_id']
    user_corpus = users_review_df[users_review_df['user_id'] == user_id]['text'].tolist()[0]
    business_corpus = business_review_df[business_review_df['business_id'] == business_id]['text'].tolist()[0]
    tfidf = model.fit_transform(user_corpus + business_corpus)
    user_tidf = tfidf[:len(user_corpus)]
    business_tfidf = tfidf[len(user_corpus):]
    print(user_corpus, user_tidf)
    print("########################")
    print(business_corpus, business_tfidf)
    similarity_matrix = (user_tidf @ business_tfidf.T).toarray()
    user_review_hours = [(pd.Timestamp.now() - timestamp) / pd.Timedelta(hours=1) for timestamp in
                         users_review_df[users_review_df['user_id'] == user_id]['date'].tolist()[0]]
    business_review_hours = [(pd.Timestamp.now() - timestamp) / pd.Timedelta(hours=1) for timestamp in
                            business_review_df[business_review_df['business_id'] == business_id]['date'].tolist()[0]]
    hours_array = np.transpose([np.tile(user_review_hours, len(business_review_hours)),
                                np.repeat(business_review_hours, len(user_review_hours))]).reshape((len(user_review_hours), len(business_review_hours), 2))
    tfidf_feature = np.concatenate((similarity_matrix[..., np.newaxis], hours_array), axis=2)
    tf_idf_df["user_id"].append(user_id)
    tf_idf_df["business_id"].append(business_id)
    tf_idf_df["tf_idf_hours_matrix"].append(tfidf_feature)
    

0it [00:00, ?it/s]

1it [00:00, 11.11it/s]

["Had a party of 6 here for hibachi. Our waitress brought our separate sushi orders on one plate so we couldn't really tell who's was who's and forgot several items on an order. I understand making mistakes but the restaraunt was really quiet so we were kind of surprised. Usually hibachi is a fun lively experience and our  cook  said maybe three words, but he cooked very well his name was Francisco. Service was fishy, food was pretty good, and im hoping it was just an off night here. But for the money I wouldn't go back.", "just moved right down the street from derfs and I'm fully stoked that it's so close! The hickory burger makes my mouth water, and the service is outstanding. My server was pilar and it was some of the best service ever shes awesome. Thanks derfs!", 'Sarah Jacob, the promotional manager really allowed me to enjoy my experience at sharkeez  and guided me to the best menu options and drink deals. Very friendly service and fun bar! Would recommend!']   (0, 3456)	0.14617




In [14]:
model.get_feature_names_out().shape

(3501,)

In [46]:
tf_idf_dataframe = pd.DataFrame(tf_idf_df)

In [47]:
tf_idf_dataframe.to_pickle("final_CA100_tf_idf.gzip")