In [1]:
import os
from glob import glob
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from scipy.sparse.linalg import svds
import numpy as np
from rouge import Rouge
import math

path = r"C:\Users\pavel\Skola\InformationRetrieval\reviews.csv"

# Read data into dataframe
dtypes = {
    'review_id': str,
    'business_name': str,
    'text': str,
    'useful': float
}

df = pd.read_csv(path, usecols=dtypes.keys(), dtype=dtypes, delimiter=';', quotechar='"', nrows=1000000)

# Encode IDs
df['review_id_transformed'] = LabelEncoder().fit_transform(df['review_id'])

# Sort to make sure that ID = index
df.sort_values(by='review_id_transformed', inplace=True)

In [2]:
# Transform tfidf
vec = TfidfVectorizer(stop_words='english', token_pattern=r'[a-zA-Z]{3,}')
X = vec.fit_transform(df['text'])
X = X.T
print(X.shape)

(243626, 1000000)


In [3]:
# Number of latent factors
n_latent_factors = 100

# Compute SVD
U, s, Vt = svds(X, k=n_latent_factors)
S = np.diag(s)

# Representation of documents in latent space
embeddings = Vt.T

In [4]:
# Discounted cumulative gain
def dcg(scores):
    res = 0
    for i, score in enumerate(scores):
        res += score / math.log(i + 2)
    
    return res


In [5]:
# Score results according to benchmarks
def score(metric, benchmarks, results):
    scores = [0 for _ in range(len(results))]

    # Get rouge scores for results and etalons
    rouge = Rouge()
    for i, res in enumerate(results):
        if i % 20 == 0 and i > 0:
            print(i)
        for et in benchmarks:
            et_text = df[df['review_id'] == et]['text'].iloc[0]
            r = rouge.get_scores(res, et_text)[0]
            # print(r)
            scores[i] += r[metric]['r']
            
    return dcg(scores)


In [6]:
n_neighbors = 100
# embeddings = S.dot(Vt).T
embeddings = Vt.T
file = r'..\etalons_1.txt'
with open(file, 'r') as f:
    query = f.readline()
    benchmarks = f.readlines()

query = query[:-1]
benchmarks = [e[:-1] for e in benchmarks]

print('Query: "', query, '"')

query_vec = vec.transform([query])

# Query vector in latent space
query_embd = S.dot(U.T).dot(query_vec.T.todense())

# Find results
nbrs = NearestNeighbors(n_neighbors=n_neighbors)
nbrs.fit(embeddings)
_, neighbors = nbrs.kneighbors(query_embd.T)
neighbors = neighbors[0]

results = df[df['review_id_transformed'].isin(neighbors)]
results = results.drop(results[results['review_id'].isin(benchmarks)].index)
print(results.shape)


Query: " It was a really good pizza. "


(96, 5)


In [7]:
reviews = []
for i in neighbors:
    if i in results['review_id_transformed'].tolist():
        row = results[results['review_id_transformed'] == i]
        text = row['text'].iloc[0]
        reviews.append(text)

gain_1 = score('rouge-1', benchmarks, reviews)
gain_L = score('rouge-l', benchmarks, reviews)

print(gain_1, gain_L)

20


40


60


80


20


40


60


80


55.16196980595818 50.25815605666004


In [8]:
lucene_files = sorted(glob(r'..\lucene*ids.txt'))
print('Found these files: ', lucene_files)

query_files = sorted(glob(os.path.join('..', 'etalons_*.txt')))
for f_name, f_query in zip(lucene_files, query_files):
    # Read review ids returned by lucene
    with open(f_name, 'r') as f:
        ids = f.readlines()
    ids = [id[:-1] for id in ids]
    
    # Read ids of files chosen as etalons
    with open(f_query, 'r') as f:
        benchmarks = f.readlines()
    benchmarks = benchmarks[1:]
    benchmarks = [e[:-1] for e in benchmarks]
    
    ids = [id for id in ids if id not in benchmarks]
    
    print()
    reviews = df[df['review_id'].isin(ids)]
    reviews = [df[df['review_id'] == i] for i in ids]
    reviews = [r['text'].iloc[0] for r in reviews]
    
    gain_1 = score('rouge-1', benchmarks, reviews)
    gain_L = score('rouge-l', benchmarks, reviews)
    
    print(gain_1, gain_L)
    

Found these files:  ['..\\lucene_q1_results_ids.txt']



20


40


60


80


20


40


60


80


55.57428791536812 51.04795324763392
