In [7]:
import os
from glob import glob
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from scipy.sparse.linalg import svds
import numpy as np
from rouge import Rouge
import math

path = r"C:\Users\pavel\Skola\InformationRetrieval\reviews.csv"

# Read data into dataframe
dtypes = {
    'review_id': str,
    'business_name': str,
    'text': str,
    'useful': float
}

df = pd.read_csv(path, usecols=dtypes.keys(), dtype=dtypes, delimiter=';', quotechar='"', nrows=1000000)

# Encode IDs
df['review_id_transformed'] = LabelEncoder().fit_transform(df['review_id'])

# Sort to make sure that ID = index
df.sort_values(by='review_id_transformed', inplace=True)

In [8]:
# Transform tfidf
vec = TfidfVectorizer(stop_words='english', token_pattern=r'[a-zA-Z]{3,}')
X = vec.fit_transform(df['text'])
X = X.T
print(X.shape)

(243626, 1000000)


In [9]:
# Number of latent factors
n_latent_factors = 100

# Compute SVD
U, s, Vt = svds(X, k=n_latent_factors)
S = np.diag(s)

# Representation of documents in latent space
embeddings = Vt.T

In [10]:
# Discounted cumulative gain
def dcg(scores):
    res = 0
    for i, score in enumerate(scores):
        res += score / math.log(i + 2)
    
    return res


In [11]:
# Score results according to benchmarks
def score(metric, benchmarks, results):
    scores = [0 for _ in range(len(results))]

    # Get rouge scores for results and etalons
    rouge = Rouge()
    for i, res in enumerate(results):
        if i % 20 == 0 and i > 0:
            print(i)
        for et in benchmarks:
            et_text = df[df['review_id'] == et]['text'].iloc[0]
            r = rouge.get_scores(res, et_text)[0]
            # print(r)
            scores[i] += r[metric]['r']
            
    return dcg(scores)


In [12]:
from time import time
n_neighbors = 100
# embeddings = S.dot(Vt).T
embeddings = Vt.T
file = r'..\etalons_3.txt'
with open(file, 'r') as f:
    query = f.readline()
    benchmarks = f.readlines()

query = query[:-1]
benchmarks = [e[:-1] for e in benchmarks]

print('Query: "', query, '"')

start = time()
query_vec = vec.transform([query])

# Query vector in latent space
query_embd = S.dot(U.T).dot(query_vec.T.todense())

# Find results
nbrs = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=4, metric='cosine')
nbrs.fit(embeddings)
dist, neighbors = nbrs.kneighbors(query_embd.T)
neighbors = neighbors[0]
print('done in ', time() - start)
results = df[df['review_id_transformed'].isin(neighbors)]
results = results.drop(results[results['review_id'].isin(benchmarks)].index)
print(results.shape)


Query: " 'recommended burger restaurants' "


done in  7.561911344528198


(98, 5)
[[0.08749427 0.09216553 0.09517244 0.09549658 0.09566841 0.09748466
  0.09771338 0.09823904 0.09866347 0.09951902 0.09966602 0.09971278
  0.10019776 0.10063332 0.1007125  0.10078634 0.10080144 0.10199687
  0.10201588 0.1021267  0.1028807  0.10300368 0.10302037 0.10335461
  0.10373298 0.10373539 0.10386757 0.10387268 0.10395817 0.10402786
  0.10415095 0.1042101  0.10426666 0.10444728 0.10445793 0.10465465
  0.10493952 0.10526967 0.10568834 0.1057031  0.10600228 0.10607141
  0.10644499 0.10644904 0.10688063 0.10691853 0.10704703 0.10716147
  0.10734067 0.10820501 0.10872634 0.10879153 0.10904492 0.1110894
  0.11117171 0.11120008 0.11150189 0.11179108 0.11194627 0.11202163
  0.11203992 0.11212308 0.11219212 0.11229762 0.11230991 0.11236789
  0.11237269 0.11239994 0.1125596  0.11332335 0.11388829 0.1139606
  0.11435462 0.11458132 0.11473489 0.11489824 0.11494794 0.11509276
  0.11548871 0.11595315 0.11684817 0.116916   0.11723594 0.11734054
  0.11797422 0.11803814 0.11821958 0.11831

In [13]:
reviews = []
for i in neighbors:
    if i in results['review_id_transformed'].tolist():
        row = results[results['review_id_transformed'] == i]
        text = row['text'].iloc[0]
        reviews.append(text)

gain_1 = score('rouge-1', benchmarks, reviews)
gain_L = score('rouge-l', benchmarks, reviews)

print(gain_1, gain_L)

20


40


60


80


20


40


60


80


51.14248517144406 46.846839939606056


In [14]:
lucene_file = r'..\lucene_q3_results_ids.txt'

query_file = r'..\etalons_3.txt'

# Read review ids returned by lucene
with open(lucene_file, 'r') as f:
    ids = f.readlines()
ids = [id[:-1] for id in ids]

# Read ids of files chosen as etalons
with open(query_file, 'r') as f:
    benchmarks = f.readlines()
benchmarks = benchmarks[1:]
benchmarks = [e[:-1] for e in benchmarks]

ids = [id for id in ids if id not in benchmarks]

# reviews = df[df['review_id'].isin(ids)]
reviews = [df[df['review_id'] == i] for i in ids]
reviews = [r['text'].iloc[0] for r in reviews]

gain_1 = score('rouge-1', benchmarks, reviews)
gain_L = score('rouge-l', benchmarks, reviews)

print(gain_1, gain_L)


20


40


60


80


20


40


60


80
