In [1]:
import numpy
import random
from collections import defaultdict
import urllib
import math
import random
import collections
import string
import csv
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import svm
import sklearn.metrics
from sklearn.svm import LinearSVC
from nltk.corpus import stopwords

In [2]:
def parseLabeledData(path):
    """Yield one record per row of the labeled CSV file at ``path``.

    Every non-empty row must provide at least five columns, in this order:
    asin, question, review, answer, relevance.  Rows are parsed with
    ``csv.reader`` so that a quoted field may itself contain commas; a plain
    ``split(',')`` would cut such a field apart and shift every later column.

    Yields:
        dict with the string values "asin", "question", "review" and
        "answer", plus "relevance" converted to ``float``.

    Raises:
        IndexError: if a non-empty row has fewer than five columns.
        ValueError: if the fifth column cannot be converted to ``float``.
    """
    # newline='' lets the csv module do its own line-ending handling.
    with open(path, 'r', newline='') as file:
        for row in csv.reader(file):
            if not row:
                # Blank line (for example a trailing newline at end of file).
                continue
            yield {"asin": row[0],
                   "question": row[1],
                   "review": row[2],
                   "answer": row[3],
                   "relevance": float(row[4])}
        

print("Reading labeled data...")
# NOTE: absolute, machine-specific path; point it at your local copy of the file.
data = [record for record in parseLabeledData("C:/Users/Moi/Downloads/highestReviewData.csv")]
# Alternative input location used on another machine:
#data = parseLabeledData("/Users/Silvia/Desktop/New Data - Sheet1.csv")

# Column-wise views of the labeled rows, in the same order as `data`.
asins = [record['asin'] for record in data]
queries = [record['question'] for record in data]
reviews = [record['review'] for record in data]
answers = [record['answer'] for record in data]
relevances = [record['relevance'] for record in data]
print("done")

Reading labeled data...
done


In [3]:
def parseAllQueries(path):
    """Load the question file at ``path`` and group its records by product.

    Each non-blank line of the file is evaluated as a Python expression and
    must produce a mapping that has an "asin" key.

    Returns:
        defaultdict mapping each asin to the list of records read for it, in
        file order.  Looking up an unknown asin yields (and stores) an empty
        list.
    """
    dataList = defaultdict(list)
    # The context manager closes the file even if a line fails to parse.
    with open(path, 'r') as file:
        for line in file:
            if not line.strip():
                continue
            # SECURITY: eval() executes whatever code the line contains, so
            # this must only be used on trusted files.  If every line is a
            # plain literal, ast.literal_eval would be a safer parser.
            record = eval(line)
            dataList[record["asin"]].append(record)

    return dataList

def parseAllReviews(path):
    """Load the review file at ``path`` and group its records by product.

    Each non-blank line of the file is evaluated as a Python expression and
    must produce a mapping that has an "asin" key.

    Returns:
        defaultdict mapping each asin to the list of records read for it, in
        file order.  Looking up an unknown asin yields (and stores) an empty
        list.
    """
    dataList = defaultdict(list)
    # The context manager closes the file even if a line fails to parse.
    with open(path, 'r') as file:
        for line in file:
            if not line.strip():
                continue
            # SECURITY: eval() executes whatever code the line contains, so
            # this must only be used on trusted files.  If every line is a
            # plain literal, ast.literal_eval would be a safer parser.
            record = eval(line)
            dataList[record["asin"]].append(record)

    return dataList

print("Reading all reviews & all questions...")

# NOTE: absolute, machine-specific paths; point them at your local copies.
allReviews = parseAllReviews("C:/Users/Moi/Downloads/reviews.json")
allQuestions = parseAllQueries("C:/Users/Moi/Downloads/qa.json")

# Alternative input locations used on another machine:
#allReviews = parseAllReviews("/Users/Silvia/Downloads/reviews.json")
#allQuestions = parseAllQueries("/Users/Silvia/Downloads/qa.json")

# Open question: do we have to remove questions that have no reviews or reviews that have no questions??
# docSet holds every review text first, followed by every question text.
docSet = [review["reviewText"] for entry in allReviews.values() for review in entry]
docSet += [question["question"] for entry in allQuestions.values() for question in entry]

# Whitespace-token count of each document, and the mean document length.
docLen = [len(document.split()) for document in docSet]
avgdl = sum(docLen) / len(docLen)

print("done")

Reading all reviews & all questions...
done


In [4]:
def countAllWords():
    """Count how often each non-stop-word occurs across the loaded corpus.

    Reads the module-level ``allReviews`` (field "reviewText") and
    ``allQuestions`` (field "question").  Each text has its ASCII punctuation
    removed, is lower-cased and split on whitespace; tokens found in NLTK's
    English stop-word list are ignored.

    Returns:
        defaultdict(int) mapping token -> number of occurrences.  Reviews are
        counted before questions, so key insertion order follows that order.
    """
    # A set gives O(1) membership tests; the NLTK list would be scanned
    # linearly for every single token.
    stopWordSet = set(stopwords.words('english'))
    # Translation table that deletes every character in string.punctuation;
    # built once instead of rebuilding a set for every document.
    dropPunctuation = str.maketrans('', '', string.punctuation)

    allWords = defaultdict(int)

    def tally(text):
        # Shared cleaning + counting step for reviews and questions.
        for w in text.translate(dropPunctuation).lower().split():
            if w not in stopWordSet:
                allWords[w] += 1

    for productReviews in allReviews.values():
        for review in productReviews:
            tally(review["reviewText"])

    for productQuestions in allQuestions.values():
        for question in productQuestions:
            tally(question["question"])

    return allWords

# Token frequencies over all loaded reviews and questions (stop words excluded).
allWords = countAllWords()

In [5]:
# Vocabulary: the 5000 most frequent tokens, most frequent first.  The sort is
# stable, so tokens with equal counts keep their order from allWords.
commonWords = [word for word, _ in sorted(allWords.items(), key=lambda item: -item[1])][:5000]

In [6]:
def wordToIndex(term):
    """Return the position of ``term`` in ``commonWords``, or -1 if it is absent."""
    try:
        return commonWords.index(term)
    except ValueError:
        # list.index raises ValueError when the term is not in the vocabulary.
        return -1

In [7]:
# Memo of already computed bags, keyed by (document, length).
bagCache = {}

def bagOfWords(document, length):
    """Return the bag-of-words count vector of ``document``.

    The document has its ASCII punctuation removed, is lower-cased and split
    on whitespace.  Position ``k`` of the result holds how many times the
    vocabulary word with index ``k`` (as reported by ``wordToIndex``) occurs;
    words that are unknown or whose index is >= ``length`` are ignored.

    Args:
        document: text to vectorise.
        length: number of vocabulary entries to keep (size of the result).

    Returns:
        list of ``length`` ints.  The list is stored in ``bagCache`` and the
        same object is returned on later calls with the same arguments, so
        callers must not modify it in place.
    """
    key = (document, length)
    if key in bagCache:
        return bagCache[key]

    bag = [0]*length

    stripped = document.translate(str.maketrans('', '', string.punctuation))
    # Count every distinct token once; calling doc.count(term) for each
    # token was quadratic in the document length and repeated the
    # vocabulary lookup for duplicated tokens.
    termCounts = collections.Counter(stripped.lower().split())

    for term, count in termCounts.items():
        index = wordToIndex(term)

        if 0 <= index < length:
            bag[index] = count

    bagCache[key] = bag

    return bag

In [8]:
def pairwiseProduct(review, question, length):
    """Return the element-wise product of the two texts' bag-of-words vectors.

    Both ``review`` and ``question`` are vectorised with
    ``bagOfWords(text, length)``; entry ``k`` of the result is the review
    count times the question count for vocabulary index ``k``.
    """
    reviewBag = bagOfWords(review, length)
    questionBag = bagOfWords(question, length)

    return [reviewBag[k] * questionBag[k] for k in range(length)]

In [9]:
def featForOnlyQuestion(question, length):
    """Feature vector built from the question alone: a leading constant 1
    followed by the question's bag-of-words counts of size ``length``."""
    features = [1]
    features.extend(bagOfWords(question, length))
    return features

In [352]:
def featForQuestionAndRelevantReview(review, question):
    """Feature vector for one (review, question) pair: a leading constant 1
    followed by the pairwise product of their bags over the first 500
    vocabulary entries."""
    return [1] + pairwiseProduct(review, question, 500)

In [353]:
def normalize(featList):
    """Min-max scale ``featList`` in place and return it.

    Every element ``x`` is replaced by ``(x - lowest) / (highest - lowest)``,
    where ``lowest`` and ``highest`` are the smallest and largest elements.
    When all elements are equal the range is zero; the values are then left
    numerically unchanged (divided by 1, so they become floats).

    Args:
        featList: mutable sequence of numbers; modified in place.

    Returns:
        The same ``featList`` object.  An empty list is returned unchanged.
    """
    if not featList:
        return featList

    # Use the real extremes.  Seeding the maximum with 0 gave a wrong range
    # for all-negative input.
    lowest = min(featList)
    highest = max(featList)
    span = highest - lowest
    if span == 0:
        # Constant input: avoid dividing by zero, keep the values as they are.
        lowest, span = 0, 1

    # Cover every position; the previous loop bound skipped the last element.
    for position, value in enumerate(featList):
        featList[position] = (value - lowest) / span

    return featList

In [354]:
def train(X, y):
    """Fit a LogisticRegression with default settings on ``X`` / ``y`` and
    return the fitted estimator."""
    classifier = LogisticRegression()
    classifier.fit(X, y)
    return classifier


In [355]:
def test(y, y_hat):
    """Score predictions ``y_hat`` against labels ``y``.

    Returns:
        A string "accuracy, precision, recall", each value formatted with two
        decimals.
    """
    metrics = (
        sklearn.metrics.accuracy_score,
        sklearn.metrics.precision_score,
        sklearn.metrics.recall_score,
    )
    scores = [metric(y, y_hat) for metric in metrics]
    return "{0:.2f}, {1:.2f}, {2:.2f}".format(*scores)

In [356]:
def constrain(elems, point):
    """Lazily binarise ``elems``: yield 1 for each element strictly greater
    than ``point`` and 0 for every other element."""
    for value in elems:
        yield 1 if value > point else 0

In [357]:
random.seed(505)

# Labeled rows take part only when their relevance is strictly above this value.
thresh = 0.5

# Each point is (feature vector, label, row index into `data`).
yesPoint = [
    (featForQuestionAndRelevantReview(row["review"], row["question"]), 1, idx)
    for idx, row in enumerate(data)
    if row["answer"] == "Y" and row["relevance"] > thresh
]
noPoint = [
    (featForQuestionAndRelevantReview(row["review"], row["question"]), 0, idx)
    for idx, row in enumerate(data)
    if row["answer"] == "N" and row["relevance"] > thresh
]

# Alternative experiment (disabled): features from the question text only.
#yesPoint = [(featForOnlyQuestion(d["question"], 1000), 1 if d["answer"] == "Y" else 0, i)
#            for d, i in zip(data, range(0, len(data))) if d["answer"] == "Y" and d["relevance"] > thresh]
#noPoint  = [(featForOnlyQuestion(d["question"], 1000), 1 if d["answer"] == "Y" else 0, i)
#            for d, i in zip(data, range(0, len(data))) if d["answer"] == "N" and d["relevance"] > thresh]

# Optional class balancing (disabled):
#yesPoint = random.sample(yesPoint, len(noPoint))

points = yesPoint + noPoint
random.shuffle(points)

# Split the shuffled points back into parallel lists.
X = [features for features, _, _ in points]
y = [label for _, label, _ in points]
i = [rowIndex for _, _, rowIndex in points]

print(len(X))

1896


In [358]:
# First two thirds of the shuffled points are used for training, the rest for testing.
trainSize = len(X)*2//3

X_train = X[:trainSize]
y_train = y[:trainSize]
i_train = i[:trainSize]

X_test = X[trainSize:]
y_test = y[trainSize:]
# Row indices (into `data`) of the test points, aligned with y_test.  The
# evaluation cell zips i_test with y_test, so it has to be defined here;
# previously only i_train existed and a fresh kernel raised NameError.
i_test = i[trainSize:]

# Alternative (disabled): evaluate on every labeled row instead of the held-out third.
#X_test = [featForQuestionAndRelevantReview(d["review"], d["question"]) for d in data]
#y_test = [1 if d["answer"] == "Y" else 0 for d in data]
#i_test = list(range(0, len(data)))

print(len(X_train), len(y_train), len(X_test), len(y_test))

1264 1264 632 632


In [359]:
# Fit the classifier on the training split built from single (question, review) pairs.
lr = train(X_train, y_train)

In [360]:
print("Predict with one question and the most relevant review!")

# Keep column 1 of every predict_proba row as the score to threshold
# (presumably the probability of label 1 -- confirm against lr.classes_).
y_hatTrain = [p[1] for p in lr.predict_proba(X_train)]
y_hatTest  = [p[1] for p in lr.predict_proba(X_test)]

# Disabled variant: keep only predictions whose source row is above the relevance threshold.
#y_hatTrain = [p[1] for i, p in zip(i_train, lr.predict_proba(X_train)) if relevances[i] > thresh]
#y_hatTest  = [p[1] for i, p in zip(i_test, lr.predict_proba(X_test))  if relevances[i] > thresh]

# Labels restricted to rows whose relevance exceeds thresh.
# NOTE(review): this needs i_test (row indices aligned with y_test) to exist
# in the kernel -- confirm it is defined on a fresh run.  Neither y_train_r
# nor y_test_r is used by the loop below.
y_train_r = [p for i, p in zip(i_train, y_train) if relevances[i] > thresh]
y_test_r  = [p for i, p in zip(i_test, y_test)  if relevances[i] > thresh]

# Sweep the decision cutoff over 0.00, 0.05, ..., 0.95 and print
# "accuracy, precision, recall" for the train split and then the test split.
for cutoff in range(0, 20):
    y_hatTrain_c = list(constrain(y_hatTrain, cutoff / 20))
    y_hatTest_c = list(constrain(y_hatTest, cutoff / 20))
    #print(len(y_hatTest_c), len(y_hatTest), len(X_test), len(y_test))
    train_string = test(y_train, y_hatTrain_c)
    test_string = test(y_test, y_hatTest_c)

    print(cutoff / 20, ":\t", train_string, "\t", test_string)

print()

Predict with one question and the most relevant review!
0.0 :	 0.80, 0.80, 1.00 	 0.80, 0.80, 1.00
0.05 :	 0.80, 0.80, 1.00 	 0.79, 0.80, 0.99
0.1 :	 0.81, 0.81, 1.00 	 0.79, 0.80, 0.99
0.15 :	 0.81, 0.81, 1.00 	 0.79, 0.80, 0.98
0.2 :	 0.81, 0.81, 1.00 	 0.79, 0.80, 0.98
0.25 :	 0.82, 0.82, 1.00 	 0.79, 0.80, 0.98
0.3 :	 0.83, 0.82, 1.00 	 0.78, 0.80, 0.97
0.35 :	 0.83, 0.83, 1.00 	 0.78, 0.80, 0.96
0.4 :	 0.83, 0.83, 1.00 	 0.78, 0.81, 0.96
0.45 :	 0.84, 0.84, 0.99 	 0.78, 0.81, 0.95
0.5 :	 0.84, 0.84, 0.99 	 0.78, 0.81, 0.95
0.55 :	 0.85, 0.85, 0.98 	 0.78, 0.82, 0.94
0.6 :	 0.86, 0.87, 0.98 	 0.78, 0.82, 0.93
0.65 :	 0.86, 0.87, 0.96 	 0.78, 0.83, 0.90
0.7 :	 0.86, 0.88, 0.95 	 0.78, 0.84, 0.88
0.75 :	 0.83, 0.89, 0.89 	 0.75, 0.86, 0.83
0.8 :	 0.75, 0.90, 0.78 	 0.70, 0.87, 0.73
0.85 :	 0.55, 0.98, 0.44 	 0.50, 0.92, 0.41
0.9 :	 0.44, 1.00, 0.30 	 0.39, 0.93, 0.26
0.95 :	 0.34, 1.00, 0.17 	 0.32, 0.92, 0.16



In [361]:
def featForQuestionAndReviews(reviews, relevances, question, length=500):
    """Build the feature vector for one question and several reviews.

    The bag-of-words vectors of the reviews are combined into a single
    relevance-weighted average (each review weighted by
    ``relevance / sum(relevances)``), which is then multiplied element-wise
    with the question's bag-of-words vector.

    Args:
        reviews: review texts.
        relevances: one numeric weight per review, in the same order.
        question: question text.
        length: number of vocabulary entries to use.  Defaults to 500, the
            value that used to be hard-coded.

    Returns:
        list of ``length + 1`` numbers: a leading constant 1 followed by the
        element-wise products.
    """
    relevances = list(relevances)
    totalRelevance = sum(relevances)
    if totalRelevance == 0:
        # The weights would divide by zero; fall back to a plain average.
        relevances = [1] * len(relevances)
        totalRelevance = len(relevances)

    reviewBag = [0]*length

    for (review, relevance) in zip(reviews, relevances):
        bow = bagOfWords(review, length)
        for k, count in enumerate(bow):
            reviewBag[k] += count * relevance / totalRelevance

    questionBag = bagOfWords(question, length)

    return [1] + [reviewBag[k] * questionBag[k] for k in range(length)]

In [362]:
random.seed(505)

# Only reviews with relevance strictly above this value are attached to a question.
thresh = 0.5

yesPoint = []
noPoint = []

# Walk `data` once, treating each run of consecutive rows that share the same
# question text as one group, and build one data point per group.
i = 0
while i < len(data):
    # Despite its name this ends up holding the relevance of the LAST row of
    # the group that exceeded thresh; it is only used as a "group has at
    # least one relevant review" flag (0 means none).
    highestSoFar = 0
    
    qReviews = []
    qRelevances = []

    # The group's question and answer are taken from its first row.
    question = data[i]["question"]
    answer = data[i]["answer"]
    
    # Consume every following row with the same question; the inner loop
    # advances the shared index i past the whole group.
    while (i < len(data) and data[i]["question"] == question):        
        if (data[i]["relevance"] > thresh):
            highestSoFar = data[i]["relevance"]
            qReviews += [data[i]["review"]]
            qRelevances += [data[i]["relevance"]]
        i += 1
    
    # Groups without any relevant review are dropped.  Any answer other than
    # "Y" (not only "N") is labeled 0.
    if (answer == "Y" and highestSoFar != 0):
        yesPoint += [(featForQuestionAndReviews(qReviews, qRelevances, question), 1)]
    elif (highestSoFar != 0):
        noPoint  += [(featForQuestionAndReviews(qReviews, qRelevances, question), 0)]

# Disabled earlier variant: one point per labeled row.
#yesPoint = [(featForQuestionAndReviews(d["review"], d["question"]), 1 if d["answer"] == "Y" else 0, i)
#            for d, i in zip(data, range(0, len(data))) if d["answer"] == "Y" and d["relevance"] > 0.5]
#noPoint  = [(featForQuestionAndReviews(d["review"], d["question"]), 1 if d["answer"] == "Y" else 0, i)
#            for d, i in zip(data, range(0, len(data))) if d["answer"] == "N" and d["relevance"] > 0.5]

# Optional class balancing (disabled):
#yesPoint = random.sample(yesPoint, len(noPoint))

points = yesPoint + noPoint
random.shuffle(points)

# Split the shuffled (features, label) pairs into parallel lists.
X = [p[0] for p in points]
y = [p[1] for p in points]

print(len(X))

924


In [363]:
# First two thirds of the shuffled points train the model; the rest is held out.
splitAt = len(X)*2//3

X_train, X_test = X[:splitAt], X[splitAt:]
y_train, y_test = y[:splitAt], y[splitAt:]

print(len(X_train), len(y_train), len(X_test), len(y_test))

616 616 308 308


In [364]:
# Refit the classifier on the training split of the per-question (multi-review) features.
lr = train(X_train, y_train)

In [366]:
print("Predict with one question and all the most relevant review!")

# Keep column 1 of every predict_proba row as the score to threshold.
y_hatTrain = [probs[1] for probs in lr.predict_proba(X_train)]
y_hatTest  = [probs[1] for probs in lr.predict_proba(X_test)]

# Sweep the decision cutoff over 0.00, 0.05, ..., 0.95 and print
# "accuracy, precision, recall" for the train split and then the test split.
for cutoff in range(20):
    y_hatTrain_c = [1 if score > cutoff / 20 else 0 for score in y_hatTrain]
    y_hatTest_c = [1 if score > cutoff / 20 else 0 for score in y_hatTest]
    train_string = test(y_train, y_hatTrain_c)
    test_string = test(y_test, y_hatTest_c)

    print(cutoff / 20, ":\t", train_string, "\t", test_string)

print()

Predict with one question and all the most relevant review!
0.0 :	 0.78, 0.78, 1.00 	 0.76, 0.76, 1.00
0.05 :	 0.78, 0.78, 1.00 	 0.76, 0.76, 1.00
0.1 :	 0.78, 0.78, 1.00 	 0.76, 0.76, 1.00
0.15 :	 0.78, 0.78, 1.00 	 0.76, 0.76, 1.00
0.2 :	 0.78, 0.78, 1.00 	 0.76, 0.76, 1.00
0.25 :	 0.78, 0.78, 1.00 	 0.76, 0.76, 1.00
0.3 :	 0.78, 0.78, 1.00 	 0.76, 0.76, 1.00
0.35 :	 0.78, 0.78, 1.00 	 0.76, 0.76, 1.00
0.4 :	 0.78, 0.78, 1.00 	 0.76, 0.76, 1.00
0.45 :	 0.78, 0.78, 1.00 	 0.76, 0.76, 1.00
0.5 :	 0.78, 0.78, 1.00 	 0.76, 0.76, 1.00
0.55 :	 0.78, 0.78, 1.00 	 0.76, 0.76, 1.00
0.6 :	 0.78, 0.78, 1.00 	 0.76, 0.76, 1.00
0.65 :	 0.78, 0.78, 1.00 	 0.76, 0.76, 1.00
0.7 :	 0.78, 0.78, 1.00 	 0.76, 0.76, 1.00
0.75 :	 0.78, 0.78, 1.00 	 0.76, 0.76, 1.00
0.8 :	 0.78, 0.78, 1.00 	 0.76, 0.76, 1.00
0.85 :	 0.78, 0.78, 1.00 	 0.76, 0.76, 1.00
0.9 :	 0.78, 0.78, 1.00 	 0.76, 0.76, 1.00
0.95 :	 0.78, 0.78, 1.00 	 0.76, 0.76, 1.00

