In [1]:
import pandas as pd
import os
import nltk
from nltk.stem.snowball import EnglishStemmer
import cv2

nrows = None

# nltk.download()

# Preprocess text

In [2]:
# Shared NLP helpers used by clean_up().
stemmer = EnglishStemmer()
tokenizer = nltk.RegexpTokenizer(r'\w+')  # a token is a run of word characters
# The stop words are stemmed too, so they can be compared against stemmed tokens.
stop_words = list(map(stemmer.stem, nltk.corpus.stopwords.words('english')))

def clean_up(text):
    """Tokenize, stem and stop-word-filter a piece of text.

    The text is split with the module-level ``tokenizer``, every token is
    stemmed with ``stemmer`` and stems found in ``stop_words`` are dropped.

    Args:
        text: the string to clean.

    Returns:
        The remaining stems joined by single spaces.
    """
    stems = (stemmer.stem(token) for token in tokenizer.tokenize(text))
    return " ".join(stem for stem in stems if stem not in stop_words)

def get_negative_comments(comments):
    """Collect the text of all reviews rated below 3 stars.

    Args:
        comments: DataFrame with a "stars" column and a "text" column.

    Returns:
        A list with one string per negative review, in row order, with every
        newline character replaced by a space.
    """
    is_negative = comments["stars"] < 3
    negative_texts = comments[is_negative]["text"].to_list()
    return [review.replace('\n', ' ') for review in negative_texts]


In [35]:
filename = './yelp_dataset/yelp_academic_dataset_review.json'
chunksize = 64

def tokenize_comments():
    """Stream the review file and write the cleaned negative reviews to CSV.

    The JSON-lines file ``filename`` is read ``chunksize`` records at a time.
    For each chunk the negative reviews are extracted with
    get_negative_comments(), cleaned with clean_up() and written to
    'preprocessed_text.csv' as a single 'tokenized_text' column.

    The first chunk (re)creates the file and writes the header; later chunks
    are appended without a header. Re-running the function therefore replaces
    the previous output instead of appending a second header and duplicate
    rows to it.
    """
    output_path = 'preprocessed_text.csv'
    reader = pd.read_json(filename, lines=True, chunksize=chunksize)
    for chunk_no, comments in enumerate(reader):
        negative_comments = get_negative_comments(comments)
        tokenized_text = [clean_up(comment) for comment in negative_comments]
        res_df = pd.DataFrame(data=tokenized_text, columns=['tokenized_text'])
        is_first_chunk = chunk_no == 0
        res_df.to_csv(
            output_path,
            mode='w' if is_first_chunk else 'a',  # truncate once, then append
            index=False,
            header=is_first_chunk,
        )

# tokenize_comments()


In [3]:
# Path of the CSV with stemmed negative reviews ('preprocessed_text.csv' is the
# file tokenize_comments() writes).
t_filename = './preprocessed_text.csv'

# NOTE(review): get_most_used() and del_most_freq() read a module-level `df`.
# With the line below commented out, `df` is not defined anywhere in the visible
# notebook, so those functions fail on a fresh kernel — confirm where it is loaded.
# df = pd.read_csv(t_filename)

Unnamed: 0,tokenized_text
0,stay mani marriott renaiss marriott huge disap...
1,place use cool chill place bunch neanderth bou...
2,set perfect adequ food come close dine chain l...
3,never seem get order correct servic crappi foo...
4,disappoint bolt follow fanni fabric close like...
...,...
95,decid take laptop mactron base friend recommen...
96,1 would like make clear visit offic comment de...
97,awe awe awe servic rude hostess dismiss care b...
98,recommend place ate grandfath halfway eat meal...


In [4]:
def get_most_used(top_n=500, output_path="most_used.txt"):
    """Write the words that occur in the most reviews to a text file.

    Reads the module-level DataFrame ``df`` and counts, for every word, the
    number of rows of ``df["tokenized_text"]`` that contain it (a word is
    counted once per row). Rows that are not strings, e.g. NaN, are skipped.

    Words are ranked by descending document count. Ties are broken
    alphabetically so the output does not depend on set iteration order and
    is identical across kernel restarts.

    Args:
        top_n: number of top-ranked words to write (default 500).
        output_path: file to write, one word per line (default "most_used.txt").
    """
    doc_freq = {}
    for comment in df["tokenized_text"].to_list():
        if not isinstance(comment, str):
            continue
        # set(): count each word at most once per review.
        for word in set(comment.split()):
            doc_freq[word] = doc_freq.get(word, 0) + 1

    # Ranking by raw count is equivalent to ranking by count / number of rows.
    ranked = sorted(doc_freq.items(), key=lambda item: (-item[1], item[0]))

    with open(output_path, "w") as f:
        f.writelines(f"{word}\n" for word, _ in ranked[:top_n])
            
# get_most_used()


['stay mani marriott renaiss marriott huge disappoint front desk atrium nice starbuck site nice room run old flat screen expect renaiss got hotel via pricelin rate 75 night good deal price true renaiss', 'place use cool chill place bunch neanderth bouncer hop steroid act like whatev want mani better place davi squar glad visit busi sad burren worst place davi', 'set perfect adequ food come close dine chain like chili victoria station barbecu better surpris alway pick coupon linwood restaur com', 'never seem get order correct servic crappi food inconsist gone hill steadili last 6 9 month never go', 'disappoint bolt follow fanni fabric close like fanni select small fabric lean toward contemporari also small stock fabric order howev enough room display select like tri paw closet twice mani cloth rack meant handl woman work time nice help howev mother bought fabric problem employe help us calcul yardag without take repeat consider brought pillow form doubt exact larg mother abl make projec

In [17]:
# Load the frequent-word list, one word per line. Only the line terminator is
# stripped, so a final line without a trailing newline keeps its last character.
with open("most_used.txt", "r") as f:
    n_frequent = [line.rstrip("\n") for line in f]

# Keep only the first 250 entries of the file.
n_frequent = n_frequent[:250]

def clear_words(tokens):
    """Remove the words listed in ``n_frequent`` from a tokenized string.

    Args:
        tokens: whitespace-separated words; any non-string value (e.g. NaN)
            is treated as empty.

    Returns:
        The remaining words joined by single spaces, or "" when ``tokens`` is
        not a string.
    """
    if not isinstance(tokens, str):
        return ""
    kept = (word for word in tokens.split() if word not in n_frequent)
    return " ".join(kept)


def del_most_freq():
    """Strip the frequent words from ``df`` and save it.

    Applies clear_words() to every entry of the module-level
    ``df["tokenized_text"]``, stores the result back in that column and
    writes the frame to 'preprocessed_reviews.csv'.
    """
    cleaned = df["tokenized_text"].apply(clear_words)
    df["tokenized_text"] = cleaned
    df.to_csv('preprocessed_reviews.csv')
    
# del_most_freq()

# Preprocess images

In [5]:
# Directory that receives the preprocessed images.
save_path = "preprocessed_imgs"

# Only the photo ids are kept; `nrows` limits how many records are read.
pictures = pd.read_json(
    "photos.json",
    lines=True,
    nrows=nrows,
)[["photo_id"]]

pictures.head()

Unnamed: 0,photo_id
0,Un_Og6jfhazVn7CxszkKEw
1,BFE1AFOs27scnnfeBf99ZA
2,7t-C0r1JRdoVD9FS7M-N7Q
3,rLnw0d-YYZvT9kR4y7h7_Q
4,Cv5M8MDw8a5NEWvw2AQ4nw


In [6]:
def hisEqulColor(img):
    """Equalize the histogram of the first YCrCb channel of a BGR image.

    The image is converted from BGR to YCrCb, channel 0 (the Y plane of that
    colour space) is histogram-equalized, the channels are merged back and
    the result is converted to BGR again.

    Args:
        img: image array accepted by cv2.cvtColor for COLOR_BGR2YCR_CB
            (presumably a 3-channel 8-bit BGR image — TODO confirm with callers).

    Returns:
        The same ``img`` object that was passed in. It is also handed to the
        final cv2.cvtColor call as the destination argument, so the caller's
        array is expected to hold the equalized pixels afterwards.
    """
    ycrcb=cv2.cvtColor(img,cv2.COLOR_BGR2YCR_CB)
    channels=cv2.split(ycrcb)
    # Channel 0 is passed as both source and destination of the equalization.
    cv2.equalizeHist(channels[0],channels[0])
    # Second argument is the destination: the channels go back into ycrcb.
    cv2.merge(channels,ycrcb)
    # Third argument is the destination: the BGR result is written to img.
    cv2.cvtColor(ycrcb,cv2.COLOR_YCR_CB2BGR,img)
    return img

def preprocess_imgs():
    """Resize, blur and histogram-equalize every photo listed in ``pictures``.

    For each ``photo_id`` the file ./yelp_photos/<photo_id>.jpg is read,
    resized to 224x224, smoothed with a 5x5 box filter, passed through
    hisEqulColor() and written to ``save_path`` under the same file name.

    Robustness:
        * ``save_path`` is created if missing, because cv2.imwrite does not
          create directories and only reports failure via its return value.
        * Files that cv2.imread cannot read (it returns None) are reported and
          skipped instead of crashing the whole run.
        * Single-channel images are expanded to 3-channel BGR first, because
          hisEqulColor() performs a BGR -> YCrCb conversion.

    Progress is printed every 1000 rows against the actual number of rows.
    """
    os.makedirs(save_path, exist_ok=True)
    total = len(pictures)
    for done, photo_id in enumerate(pictures["photo_id"], start=1):
        src = os.path.join("./yelp_photos", f"{photo_id}.jpg")
        img = cv2.imread(src, -1)  # -1: load the file as stored
        if img is None:
            print(f"Skipping unreadable image: {src}")
        else:
            if img.ndim == 2:
                # Grayscale file: give it three channels for the colour pipeline.
                img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
            img = cv2.resize(img, (224, 224))  # resize to vgg16 input size
            img = cv2.blur(img, (5, 5))  # 5x5 normalized box filter (not gaussian)
            img = hisEqulColor(img)  # equalize histogram
            cv2.imwrite(os.path.join(save_path, f"{photo_id}.jpg"), img)  # save image
        if not done % 1000:
            print(f"Done {done}/{total}")
        
# preprocess_imgs()

Done 1000/200 000
Done 2000/200 000
Done 3000/200 000
Done 4000/200 000
Done 5000/200 000
Done 6000/200 000
Done 7000/200 000
Done 8000/200 000
Done 9000/200 000
Done 10000/200 000
Done 11000/200 000
Done 12000/200 000
Done 13000/200 000
Done 14000/200 000
Done 15000/200 000
Done 16000/200 000
Done 17000/200 000
Done 18000/200 000
Done 19000/200 000
Done 20000/200 000
Done 21000/200 000
Done 22000/200 000
Done 23000/200 000
Done 24000/200 000
Done 25000/200 000
Done 26000/200 000
Done 27000/200 000
Done 28000/200 000
Done 29000/200 000
Done 30000/200 000
Done 31000/200 000
Done 32000/200 000
Done 33000/200 000
Done 34000/200 000
Done 35000/200 000
Done 36000/200 000
Done 37000/200 000
Done 38000/200 000
Done 39000/200 000
Done 40000/200 000
Done 41000/200 000
Done 42000/200 000
Done 43000/200 000
Done 44000/200 000
Done 45000/200 000
Done 46000/200 000
Done 47000/200 000
Done 48000/200 000
Done 49000/200 000
Done 50000/200 000
Done 51000/200 000
Done 52000/200 000
Done 53000/200 000
Do