In [3]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from tqdm import tqdm_notebook
from collections import defaultdict, Counter
import numpy as np
import matplotlib.pyplot as plt

In [4]:
from sklearn.metrics import silhouette_score

In [5]:
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*" and the
# old names were removed later, so use whichever alias this installation provides.
_seaborn_style = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
plt.style.use(_seaborn_style)
# Default figure size in inches (width, height).
plt.rcParams['figure.figsize'] = (2 * 13, 2 * 6)

In [6]:
# English stop words as a set; the token filters in this notebook test membership against it.
english_stopwords = set(stopwords.words("english"))

In [7]:
# Parallel per-review columns parsed from the tokenized dump
# (one review per line, six tab-separated fields).
uids = []
targets = []
texts = []
hotel_ids = []

# Corpus-wide frequency of every token that survives the filter below.
words = Counter()
with open('tokenize_reviewContent', 'r') as f:
    for line in tqdm_notebook(f):
        line = line.strip()
        uid, hotel_id, year, mark, target, text = line.split('\t')

        targets.append(int(target))
        uids.append(int(uid))
        hotel_ids.append(int(hotel_id))

        # Keep tokens longer than two characters that are purely alphabetic
        # and are not English stop words.
        res = [
            word for word in text.split()
            if len(word) > 2 and word.isalpha() and word not in english_stopwords
        ]
        words.update(res)

        texts.append(' '.join(res))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [8]:
def del_words(words_count, min_count = 10):
    """Return a new Counter with only the tokens counted at least `min_count` times.

    words_count: mapping token -> count; only `.items()` is read and it is not modified.
    min_count: inclusive lower bound a count must reach for its token to be kept.
    """
    return Counter({token: val for token, val in words_count.items() if val >= min_count})

def word_to_vocab(words):
    """Map every key of `words` to its position in ascending sorted order.

    Returns a dict {word: index} with indices running from 0 to len(words) - 1.
    """
    return {word: index for index, word in enumerate(sorted(words.keys()))}

In [16]:
# Index the tokens counted at least 5 times; the resulting token -> column mapping
# is what the TF-IDF vectorizer receives as its vocabulary.
vocab = word_to_vocab(del_words(words, min_count = 5))

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF restricted to the precomputed vocabulary (token -> feature index).
vectorizer = TfidfVectorizer(vocabulary = vocab)

In [22]:
%%time
# TF-IDF matrix of the filtered review texts, one row per entry of `texts`.
X = vectorizer.fit_transform(texts)

CPU times: user 21.9 s, sys: 420 ms, total: 22.3 s
Wall time: 22.3 s


In [25]:
from sklearn.decomposition import TruncatedSVD

In [26]:
# 1000-component truncated SVD. The default solver is randomized, so the seed is
# fixed to make the saved projection reproducible.
model = TruncatedSVD(n_components = 1000, random_state = 42)

In [27]:
%%time
# Fit the SVD on the TF-IDF matrix and keep the projected rows.
X_svd = model.fit_transform(X)

CPU times: user 21min 54s, sys: 1min 14s, total: 23min 8s
Wall time: 10min 50s


In [30]:
# Persist the projection; np.save appends ".npy" because the name has no extension.
np.save('hotels_svd_1000', X_svd)

In [32]:
from sklearn.neighbors import KDTree

In [None]:
# For each dimensionality, build a KD-tree on the leading `dim` SVD components and
# query the 51 nearest rows of every row (k = 51 includes the query row itself).
# NOTE(review): the query result is discarded — confirm whether it should be stored.
for dim in tqdm_notebook([3, 10, 30, 50, 100, 300, 1000]):
    X_small = X_svd[:, :dim]
    kdt = KDTree(X_small, metric='euclidean')
    kdt.query(X_small, k = 51, return_distance = True)

HBox(children=(IntProgress(value=0, max=7), HTML(value='')))

In [None]:
# NOTE(review): unfinished cell. `pick` is a bare, undefined name, so running this raises
# NameError after the file has already been opened (and truncated) for writing.
# Presumably a `pickle.dump(...)` call was intended; the payload cannot be determined
# from this notebook — complete or remove the cell.
with open('hotels_svd_text.pickle', 'wb') as f:
    pick

In [4]:
# Word-level tokenizer instance and NLTK's sentence splitter (a function, not an instance);
# both are used when the tokenized dumps are written.
word_tokenizer = nltk.WordPunctTokenizer()
sent_tokenizer = nltk.sent_tokenize

In [5]:
# Write two tab-separated dumps of the reviews: one line per sentence (out_1) and one
# line per whole review (out_2). Every line starts with uid, hotel_id, year, mark, target.
# The metadata and review files are read in lockstep; zip stops at the shorter one.
with open('sent_tokenize_reviewContent', 'w') as out_1, \
        open('tokenize_reviewContent', 'w') as out_2, \
        open('./data/metadata') as f1, \
        open('./data/reviewContent') as f2:
    for line1, line2 in tqdm_notebook(zip(f1, f2)):
        _, _, mark, target, _ = line1.strip().split()
        # Remap the metadata label: 1 becomes "0", every other value becomes "1".
        target = '0' if int(target) == 1 else '1'

        uid, hotel_id, year, text = line2.strip().split('\t', 3)
        prefix = [uid, hotel_id, year, mark, target]

        for sent in sent_tokenizer(text):
            sent = ' '.join(word_tokenizer.tokenize(sent))
            out_1.write('\t'.join(prefix + [sent]) + '\n')

        text = ' '.join(word_tokenizer.tokenize(text))

        out_2.write('\t'.join(prefix + [text]) + '\n')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [None]:
# Count how often each lower-cased, purely alphabetic, non-stop-word token occurs
# in the raw reviews.
tokenizer = nltk.WordPunctTokenizer()

# Created here so the cell also runs on a fresh kernel.
words_count = Counter()

with open('./data/reviewContent') as f:
    for line in tqdm_notebook(f):
        line = line.strip()
        uid, hotel_id, year, text = line.split('\t', 3)
        # Iterate over word tokens. Iterating over sent_tokenizer(text) yields whole
        # sentences, which contain spaces/punctuation and so almost never pass the
        # alphabetic check below.
        for word in tokenizer.tokenize(text.lower()):
            if word not in english_stopwords and word.isalpha():
                words_count[word] += 1

In [None]:
tokenizer = nltk.WordPunctTokenizer()

# Frequency of every lower-cased, purely alphabetic token that is not an English stop word.
words_count = Counter()

with open('./data/reviewContent') as f:
    for line in tqdm_notebook(f):
        line = line.strip()
        uid, hotel_id, year, text = line.split('\t', 3)
        text = tokenizer.tokenize(text.lower())
        words_count.update(
            word for word in text
            if word.isalpha() and word not in english_stopwords
        )

In [None]:
# Number of distinct tokens that were counted.
len(words_count)

In [None]:
# Histogram of per-token frequencies: 50 bins over 0..200000, log-scaled bar heights.
plt.hist(list(words_count.values()), range=[0, 200000], bins=50, log=True)
plt.xlabel("Counts")
plt.show()

In [None]:
# Keep only the tokens counted at least 5 times.
finall_words = del_words(words_count, min_count = 5)

In [None]:
# Number of distinct tokens left after the frequency cut.
len(finall_words)

In [None]:
# Same histogram as for the full counts, restricted to the tokens that passed the cut.
plt.hist(list(finall_words.values()), range=[0, 200000], bins=50, log=True)
plt.xlabel("Counts")
plt.show()

In [None]:
# Collect every whitespace-separated token of stem.txt that contains at least one
# non-alphabetic character.
bad_words = set()
with open('stem.txt') as f:
    for line in tqdm_notebook(f):
        # Split once, and at most three times, so a tab inside the review text stays
        # in `text` instead of breaking the 4-way unpacking.
        fields = line.strip().split('\t', 3)
        if len(fields) < 4:
            # Lines without all four fields carry no usable text.
            continue
        uid, hotel, date, text = fields

        bad_words.update(word for word in text.split() if not word.isalpha())

In [None]:
# Display the collected tokens as the cell output (the whole set is shown).
bad_words

In [None]:
# Parallel per-review columns read from stem.txt (lines with fewer than 4 fields are skipped).
uids = []
hotels = []
dates = []
texts = []

# Positions (into the lists above) of the reviews of each hotel / each user.
hotels_2_index = defaultdict(list)
uids_2_index = defaultdict(list)
with open('stem.txt') as f:
    # Counts kept reviews only, so it stays aligned with the lists above.
    index = 0
    for line in tqdm_notebook(f):
        # Split once, and at most three times, so a tab inside the review text stays
        # in `text` instead of breaking the 4-way unpacking.
        fields = line.strip().split('\t', 3)
        if len(fields) < 4:
            continue
        uid, hotel, date, text = fields

        # Drop the tokens previously collected in `bad_words`.
        text = ' '.join(word for word in text.split() if word not in bad_words)

        hotels_2_index[int(hotel)].append(index)
        uids_2_index[int(uid)].append(index)
        index += 1

        hotels.append(int(hotel))
        uids.append(int(uid))
        dates.append(date)
        texts.append(text)

# One label slot per kept review; a review whose user never appears in the metadata stays 0.
targets = [0] * len(texts)

with open('./data/metadata') as f:
    for line in f:
        uid, hotel, mark, label, date = line.strip().split()
        label, uid = int(label), int(uid)
        if uid not in uids_2_index:
            # Metadata line for a user without any loaded review: show it.
            print(line)
            continue
        # NOTE(review): the label is written to every review of this user, so the last
        # metadata line of a user decides the label of all of that user's reviews —
        # confirm this is intended rather than a per-review assignment.
        for index in uids_2_index[uid]:
            targets[index] = label

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over the cleaned texts; max_df = 0.95 ignores terms that occur in more than
# 95% of the documents. This rebinds `vectorizer` and `X` from the earlier cells.
vectorizer = TfidfVectorizer(max_df = 0.95)
X = vectorizer.fit_transform(texts)

In [None]:
from sklearn.decomposition import TruncatedSVD
# 30-component truncated SVD; seeded because the default solver is randomized.
svd = TruncatedSVD(n_components = 30, random_state = 42)

In [None]:
# Project the TF-IDF matrix onto the SVD components.
# (The stray leading ".." that made this line a syntax error has been removed.)
X_small = svd.fit_transform(X)
# Rescale the labels as (label + 1) / 2, so -1 becomes 0.0 and 1 becomes 1.0.
y = np.array(targets)
y += 1
y = np.divide(y, 2)

In [None]:
# Rescale the labels as (1 - label) / 2, so -1 becomes 1.0 and 1 becomes 0.0.
# (The stray leading ".." that made this line a syntax error has been removed.)
y = np.array(targets)
y *= -1
y += 1
y = np.divide(y, 2)

# Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score

In [None]:
# Stratified 75/25 split of the SVD features; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_small,
    y,
    test_size = 0.25,
    random_state = 42,
    stratify = y,
)

In [None]:
# Fit a logistic regression with default settings on the training split.
# This rebinds `model`, which earlier cells used for the SVD.
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
# Scores of the fitted model on the training split itself.
train_proba = model.predict_proba(X_train)[:, 1]
train_predict = model.predict(X_train)
# ROC-AUC is computed from the second probability column, the rest from hard predictions.
train_scores = (
    roc_auc_score(y_train, train_proba),
    f1_score(y_train, train_predict),
    recall_score(y_train, train_predict),
    precision_score(y_train, train_predict),
)
print('ROC-AUC = {}, F-meausre = {}, RECALL = {}, PRECISION = {}'.format(*train_scores))

# Clustering

In [None]:
# Scan for the hotel with the most reviews, printing every new running maximum.
max_len = 0
choosen_hotel = None
for hotel, lst in hotels_2_index.items():
    if len(lst) <= max_len:
        continue
    print(hotel, len(lst))
    max_len = len(lst)
    choosen_hotel = hotel

In [None]:
# Features and labels of the reviews that belong to the selected hotel.
X_choosen = X_small[hotels_2_index[choosen_hotel]]
y_choosen = y[hotels_2_index[choosen_hotel]]

# Scale every row to unit L2 norm. An all-zero row has norm 0 and dividing by it would
# fill the row with NaN, so such rows are divided by 1 instead and stay zero.
row_norms = np.linalg.norm(X_choosen, axis = 1, keepdims = True)
row_norms[row_norms == 0] = 1.0
X_choosen = X_choosen / row_norms

In [None]:
# Class sizes among the selected hotel's reviews: (count of y == 0, count of y == 1).
len(y_choosen[y_choosen == 0]), len(y_choosen[y_choosen == 1])

In [None]:
from sklearn.cluster import KMeans

In [None]:
# 10-cluster k-means; the seed pins the random initialisation so the clustering is repeatable.
model = KMeans(n_clusters = 10, max_iter = 1000, random_state = 42)

In [None]:
%%time
# Cluster the selected hotel's unit-norm rows and keep the cluster id of every row.
predict = model.fit_predict(X_choosen)

In [None]:
# For k = 2..20, cluster the selected hotel's reviews and print, per cluster, how many
# reviews carry label 0 and label 1 as [count_0, count_1].
# Columns 0-4 (the first five SVD components) are left out of the clustering input.
X_to_fit = X_choosen[:, 5:]
for k in range(2, 21):
    # Seeded so the printed tables are repeatable.
    model = KMeans(n_clusters = k, max_iter = 1000, random_state = 42)
    predict = model.fit_predict(X_to_fit)
    # One [count_0, count_1] pair per requested cluster, so every cluster id is a valid index.
    clusters = [[0, 0] for _ in range(k)]
    # Pair each row with its own label from y_choosen. The global `targets` list is indexed
    # by position in the full corpus, not by position inside the selected hotel, and its
    # raw values -1 and 1 would both land in slot 1.
    for cluster, label in zip(predict, y_choosen):
        clusters[cluster][int(label)] += 1
    print(clusters)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a default logistic regression on the selected hotel's reviews only
# (rebinds `model`, previously the k-means estimator).
model = LogisticRegression()
model.fit(X_choosen, y_choosen)

In [None]:
from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score
# ROC-AUC on the same rows the model was fitted on (no held-out data).
roc_auc_score(y_choosen, model.predict_proba(X_choosen)[:, 1])

In [None]:
# F1 score on the same rows the model was fitted on (no held-out data).
f1_score(y_choosen, model.predict(X_choosen))

In [None]:
# Print every review text whose label is -1.
for flagged_text in (text for text, target in zip(texts, targets) if target == -1):
    print(flagged_text)

In [None]:
# For every metadata line labelled -1 whose user has loaded reviews, print all reviews
# of that user followed by a separator line.
with open('./data/metadata') as f:
    for line in f:
        uid, hotel, mark, label, date = line.strip().split()
        label = int(label)
        uid = int(uid)
        if label != -1 or uid not in uids_2_index:
            continue
        for index in uids_2_index[uid]:
            print(texts[index])
        print('-' * 100)

## Look at the original text

In [None]:
from collections import defaultdict
from tqdm import tqdm_notebook
import numpy as np
# Raw review texts in file order, plus the line positions of each user's reviews.
texts = []
uids_2_index = defaultdict(list)
with open('./data/reviewContent') as f:
    for index, line in tqdm_notebook(enumerate(f)):
        uid, hotel, date, text = line.strip().split('\t', 3)
        uids_2_index[int(uid)].append(index)
        texts.append(text)

# Labels are assigned by line position: metadata line i labels review line i.
targets = [0] * len(texts)

with open('./data/metadata') as f:
    for index, line in tqdm_notebook(enumerate(f)):
        uid, hotel, mark, label, date = line.strip().split()
        label, uid = int(label), int(uid)
        targets[index] = label
        if uid in uids_2_index:
            continue
        # Metadata line for a user that has no review line: show it.
        print(line)

In [None]:
# Per-user tallies over the users that have at least one review labelled -1:
#   all_count      - users with any review labelled -1
#   zero_pos_count - of those, users whose reviews are all labelled -1
#   one_count      - of those, users with exactly one review (their text is printed)
all_count = 0
zero_pos_count = 0
one_count = 0
for uid, lst in uids_2_index.items():
    neg = sum(1 for index in lst if targets[index] == -1)
    pos = len(lst) - neg

    if neg == 0:
        continue
    all_count += 1
    if pos != 0:
        continue
    zero_pos_count += 1
    if neg == 1:
        one_count += 1
        for index in lst:
            print('\t' + texts[index])
        print('-' * 50)

In [None]:
# (users with any -1 review, users with only -1 reviews, of those: users with a single review)
all_count, zero_pos_count, one_count