In [1]:
import numpy
import random
from collections import defaultdict
from numpy import median
import urllib
import math
import random
import collections
import string
import csv
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDClassifier
from sklearn import svm
import sklearn.metrics
from sklearn.svm import LinearSVC
from nltk.corpus import stopwords

In [2]:
def parseLabeledData(path):
    with open(path, 'r') as file:
        for line in file:
            line = line.split(',')
            yield({"asin":line[0],
                 "question":line[1],
                 "review":line[2],
                 "answer":line[3],
                 "relevance":float(line[4])
                })
        

print("Reading labeled data...")
data = list(parseLabeledData("C:/Users/Moi/Downloads/highestReviewData.csv"))
#data = parseLabeledData("/Users/Silvia/Desktop/New Data - Sheet1.csv")
asins = [d['asin'] for d in data]
queries = [d['question'] for d in data]
reviews = [d['review'] for d in data]
answers = [d['answer'] for d in data]
relevances = [d['relevance'] for d in data]
print("done")

Reading labeled data...
done


In [3]:
def parseAllQueries(path):
    file = open(path, 'r')
    dataList = defaultdict(lambda: [])
    for line in file:
        line = eval(line)
        dataList[line["asin"]].append(line)
      
    return dataList

def parseAllReviews(path):
    file = open(path, 'r')
    dataList = defaultdict(lambda: [])
    for line in file:
        line = eval(line)
        dataList[line["asin"]].append(line)
      
    return dataList

print("Reading all reviews & all questions...")

allReviews = parseAllReviews("C:/Users/Moi/Downloads/reviews.json")
allQuestions = parseAllQueries("C:/Users/Moi/Downloads/qa.json")

#allReviews = parseAllReviews("/Users/Silvia/Downloads/reviews.json")
#allQuestions = parseAllQueries("/Users/Silvia/Downloads/qa.json")

# do we have to remove questions that have no reviews or reviews that have no questions??
docSet = []
for entry in allReviews.values():
    for review in entry:
        docSet.append(review["reviewText"])

for entry in allQuestions.values():
    for question in entry:
        docSet.append(question["question"])

docLen = [len(d.split()) for d in docSet]
avgdl = sum(docLen) / len(docLen)

print("done")

Reading all reviews & all questions...
done


In [4]:
def countAllWords():
    allWords = defaultdict(int)
    englishStopWords = stopwords.words('english')
    for r in allReviews.values():
        for review in r:
            review = review["reviewText"]
            exclude = set(string.punctuation)
            review = ''.join(ch for ch in review if ch not in exclude)
            for w in review.lower().split():
                if w not in englishStopWords:
                    allWords[w] += 1

    for q in allQuestions.values():
        for question in q:
            question = question["question"]
            exclude = set(string.punctuation)
            question = ''.join(ch for ch in question if ch not in exclude)
            for w in question.lower().split():
                if w not in englishStopWords:
                    allWords[w] += 1
    
    
    return allWords

allWords = countAllWords()

In [5]:
commonWords = sorted(allWords, key=lambda x: -allWords[x])[:5000]

In [6]:
def wordToIndex(term):
    if term in commonWords:
        return commonWords.index(term)
    else:
        return -1

In [7]:
bagCache = {}

def bagOfWords(document, length):
    if (document, length) in bagCache:
        return bagCache[(document, length)]
    
    bag = [0]*length
    
    exclude = set(string.punctuation)
    doc = ''.join(ch for ch in document if ch not in exclude)
    doc = doc.lower().split()
    
    for term in doc:
        index = wordToIndex(term)
        
        if index >= 0 and index < length:
            bag[index] = doc.count(term)
            
    bagCache[(document, length)] = bag
    
    return bag

In [8]:
def pairwiseProduct(review, question, length):
    reviewBag = bagOfWords(review, length)
    questionBag = bagOfWords(question, length)
        
    bagFeat = [0]*length
    
    for i in range(0, length):
        bagFeat[i] = reviewBag[i] * questionBag[i]
        
    #for i in range(0, length):
        #if reviewBag[i] > 0 or questionBag[i] > 0:
            #print(commonWords[i], reviewBag[i], questionBag[i], bagFeat[i])
        
    return bagFeat        

In [54]:
def tf(term, document):
    count = collections.defaultdict(int)
    exclude = set(string.punctuation)
    document = ''.join(ch for ch in document if ch not in exclude)
    for word in document.split():
        count[word] += 1

    return count[term]/(len(document.split()) + 1)

In [55]:
idfDict = defaultdict(float)

def idf(term):
    term = term.lower()
    if (term in idfDict):
        return idfDict[term]

    count = 0
    for doc in docSet:
        #exclude = set(string.punctuation)
        #doc = ''.join(ch for ch in doc if ch not in exclude)
        if term in doc.lower():
            count += 1
        
    idfScore = math.log(1 + len(docSet) / (count+1))
    idfDict[term] = idfScore
    return idfScore

In [56]:
okapidict = {}

def OkapiBM25(review, question, k1, b):
    if ((review, question, k1, b) in okapidict):
        return okapidict[review, question, k1, b]
    
    question = question.lower()
    question = ''.join([c for c in question if not (c in string.punctuation)])
    
    score = 0
    for q in question.split():
        num = tf(q, review) * (k1 + 1)
        den = tf(q, review) + k1 * (1 - b + b*len(review.split()) / avgdl) 
        score += idf(q) * num / den
        
    #print(score, review, question)
    okapidict[review, question, k1, b] = score
    return score

In [57]:
tfidfdict = {}

def tfidf(document):
    if (document in tfidfdict):
        return tfidfdict[document]
    
    doc = document.lower()
    doc = ''.join([c for c in doc if not (c in string.punctuation)])
        
    feat = collections.defaultdict(int)
    for term in doc.split():
        tfscore = tf(term, doc)
        idfscore = idf(term)
        feat[term] = tfscore * idfscore
        
    tfidfdict[document] = feat
    return feat

In [58]:
# queryFeat is a feature vector for the query and reviewFeat is the feature vector for the review
def cosineSimilarity(queryFeat, reviewFeat):
    # Find the words the 2 dictionaries have in common
    querySet = set(queryFeat.keys())
    reviewSet = set(reviewFeat.keys())
    allWords = querySet.union(reviewSet)
    
    # Find the cosine similarity
    numerator = 0
    mag1 = 0
    mag2 = 0
    for word in allWords:
        numerator = numerator + queryFeat[word] * reviewFeat[word]
        mag1 = mag1 + queryFeat[word]**2
        mag2 = mag2 + reviewFeat[word]**2
    if mag1 > 0 and mag2 > 0:
        return (numerator/((mag1*mag2)**0.5))
    else:
        return -1

In [9]:
def featForOnlyQuestion(question, length):
    return ([1]+bagOfWords(question, length))

In [10]:
def featForQuestionAndRelevantReview(review, question, length = 1000):
    feat = [1]
    feat += pairwiseProduct(review, question, length)
    return feat

In [11]:
def normalize(featList):
    
    max = 0
    min = float('inf')
    for feat in featList:
        if feat > max: max = feat
        if feat < min: min = feat        
    
    for i in range(0,len(featList)-1):
        if (max - min) == 0: 
            max = 1
            min = 0
        featList[i] = (featList[i] - min) / (max - min)

    return featList

In [12]:
def train(X, y): 
    lr = LogisticRegression()
    lr.fit(X, y)
    
    return lr


In [13]:
def test(y, y_hat):
    #print(sklearn.metrics.r2_score(y, y_hat))
    
    
    accuracy = sklearn.metrics.accuracy_score(y, y_hat)
    precision = sklearn.metrics.precision_score(y, y_hat)
    recall = sklearn.metrics.recall_score(y, y_hat)
    
    return "{0:.3f}, {1:.3f}, {2:.3f}".format(accuracy, precision, recall)

In [14]:
def constrain(elems, point):    
    for elem in elems:
        if (elem > point): yield 1
        else: yield 0

In [15]:
random.seed(505)

thresh = 0.3

yesPoint = [(featForQuestionAndRelevantReview(d["review"], d["question"], 500), 1 if d["answer"] == "Y" else 0, d["relevance"])
            for d, i in zip(data, range(0, len(data))) if d["answer"] == "Y" and d["relevance"] > thresh]
noPoint  = [(featForQuestionAndRelevantReview(d["review"], d["question"], 500), 1 if d["answer"] == "Y" else 0, d["relevance"])
            for d, i in zip(data, range(0, len(data))) if d["answer"] == "N" and d["relevance"] > thresh]

#yesPoint = [(featForOnlyQuestion(d["question"], 1000), 1 if d["answer"] == "Y" else 0)
#            for d in data if d["answer"] == "Y" and d["relevance"] > thresh]
#noPoint  = [(featForOnlyQuestion(d["question"], 1000), 1 if d["answer"] == "Y" else 0)
#            for d in data if d["answer"] == "N" and d["relevance"] > thresh]

#yesPoint = [(featForOnlyQuestion(d["question"], 2000)
#             + [d["relevance"]],
#             1 if d["answer"] == "Y" else 0)
#            for d in data if d["answer"] == "Y" and d["relevance"] > thresh]
#noPoint  = [(featForOnlyQuestion(d["question"], 2000)
#             + [d["relevance"]],
#             1 if d["answer"] == "Y" else 0)
#            for d in data if d["answer"] == "N" and d["relevance"] > thresh]

#yesPoint = random.sample(yesPoint, len(noPoint))

points = yesPoint + noPoint
random.shuffle(points)

X = [p[0] for p in points]
y = [p[1] for p in points]

print(len(X))

6655


In [16]:
X_train = X[:len(X)*2//3]
y_train = y[:len(X)*2//3]

X_test = X[len(X)*2//3:]
y_test = y[len(X)*2//3:]

#X_test = [featForQuestionAndRelevantReview(d["review"], d["question"]) for d in data]
#y_test = [1 if d["answer"] == "Y" else 0 for d in data]
#i_test = list(range(0, len(data)))
                                           
print(len(X_train), len(y_train), len(X_test), len(y_test))

4436 4436 2219 2219


In [18]:
random.seed(505)

thresh = 0.0

points = []

i = 0
qNum = 0
while i < len(data):
    
    qReviews = []
    qRelevances = []

    question = data[i]["question"]
    answer = data[i]["answer"]

    while (i < len(data) and data[i]["question"] == question):        
        if (data[i]["relevance"] > thresh):
            qReviews += [data[i]["review"]]
            qRelevances += [data[i]["relevance"]]
        i += 1

    if (len(qReviews) > 0):
        for qReview, qRelevance in zip(qReviews, qRelevances):
            points += [(
                featForOnlyQuestion(question, 200) + featForQuestionAndRelevantReview(qReview, question, 200), 
                1 if answer == "Y" else 0,
                qNum
            )]
    qNum += 1




X_train = [p[0] for p in points if p[2] % 2 == 0]
y_train = [p[1] for p in points if p[2] % 2 == 0]

X_test = [p[0] for p in points if p[2] % 2 == 1]
y_test = [p[1] for p in points if p[2] % 2 == 1]

print(len(X_train), len(y_train), len(X_test), len(y_test))

73796 73796 74760 74760


In [19]:
lr = train(X_train, y_train)

In [20]:
print("Predict with one question and the most relevant review!")

y_hatTrain = [p[1] for p in lr.predict_proba(X_train)]
y_hatTest  = [p[1] for p in lr.predict_proba(X_test)]

#y_hatTrain = [p[1] for i, p in zip(i_train, lr.predict_proba(X_train)) if relevances[i] > thresh]
#y_hatTest  = [p[1] for i, p in zip(i_test, lr.predict_proba(X_test))  if relevances[i] > thresh]

for cutoff in range(10, 11):
    y_hatTrain_c = list(constrain(y_hatTrain, cutoff / 20))
    y_hatTest_c = list(constrain(y_hatTest, cutoff / 20))
    #print(len(y_hatTest_c), len(y_hatTest), len(X_test), len(y_test))
    train_string = test(y_train, y_hatTrain_c)
    test_string = test(y_test, y_hatTest_c)

    print(cutoff / 20, ":\t", train_string, "\t", test_string)

print()

Predict with one question and the most relevant review!
0.5 :	 0.718, 0.723, 0.959 	 0.664, 0.692, 0.916



In [21]:
confidence = [abs(y - 0.5)*2 for y in y_hatTest]
med = numpy.median(confidence)

print(med)

y_hatTest_c = list(constrain(y_hatTest, cutoff / 20))

y_test2 = [y for y, y_hat in zip(y_test, y_hatTest) if abs(y_hat-0.5)*2 > med]
y_hatTest_c2 = [y_c for y_c, y_hat in zip(y_hatTest_c, y_hatTest) if abs(y_hat-0.5)*2 > med]

print(test(y_test2, y_hatTest_c2))

0.3857342114246163
0.732, 0.751, 0.957


In [22]:
def featForQuestionAndReviews(reviews, relevances, question, length):    
    feat = [1]
    
    reviewBag = [0]*length
    
    relevances = [r for r in relevances]
    
    #totalRelevance = sum(relevances)
    
    med = numpy.median(relevances)
    
    thresh = 0.5
    count = 0
    for (review, relevance) in zip(reviews, relevances):
        if relevance >= thresh or relevance >= med:
            count += 1
    
    for (review, relevance) in zip(reviews, relevances):
        if (relevance >= thresh or relevance >= med):
            bow = bagOfWords(review, length)
            reviewBag = [x + y * (relevance) / count for x, y in zip(reviewBag, bow)]
            

    questionBag = bagOfWords(question, length)
    
    bagFeat = [0]*length
    
    for i in range(0, length):
        bagFeat[i] = questionBag[i] * reviewBag[i]
        
    cos = sum(bagFeat)
    

    feat += questionBag
    feat += [cos]
    feat += reviewBag
    
    return feat

In [68]:
random.seed(15)

thresh = 0.0

points = []

i = 0
while i < len(data):

    qReviews = []
    qRelevances = []

    question = data[i]["question"]
    answer = data[i]["answer"]

    while (i < len(data) and data[i]["question"] == question):        
        if (data[i]["relevance"] > thresh):
            qReviews += [data[i]["review"]]
            qRelevances += [data[i]["relevance"]]
        i += 1

    if (len(qReviews) > 0):

        featureVector = [([1] + [cosineSimilarity(tfidf(qReviews[0]), tfidf(question))], 1 if answer == "Y" else 0)]


        #featureVector = [(featForQuestionAndReviews(qReviews, qRelevances, question, 200), 1 if answer == "Y" else 0)]
        #for _ in range(0, len(qReviews)):
        points += featureVector

#yesPoint = [(featForQuestionAndReviews(d["review"], d["question"]), 1 if d["answer"] == "Y" else 0, i)
#            for d, i in zip(data, range(0, len(data))) if d["answer"] == "Y" and d["relevance"] > 0.5]
#noPoint  = [(featForQuestionAndReviews(d["review"], d["question"]), 1 if d["answer"] == "Y" else 0, i)
#            for d, i in zip(data, range(0, len(data))) if d["answer"] == "N" and d["relevance"] > 0.5]

#yesPoint = random.sample(yesPoint, len(noPoint))

random.shuffle(points)

X = [p[0] for p in points]
y = [p[1] for p in points]

print(len(X))

8280


In [69]:
X_train = X[:len(X)//2]
y_train = y[:len(X)//2]

X_test = X[len(X)//2:]
y_test = y[len(X)//2:]

print(len(X_train), len(y_train), len(X_test), len(y_test))

4140 4140 4140 4140


In [70]:
lr = train(X_train, y_train)

In [74]:
print("Predict with one question and all the most relevant review!")

y_hatTrain = [1 for p in lr.predict_proba(X_train)]
y_hatTest  = [1 for p in lr.predict_proba(X_test)]

for cutoff in range(10, 11):
    y_hatTrain_c = list(constrain(y_hatTrain, cutoff / 20))
    y_hatTest_c = list(constrain(y_hatTest, cutoff / 20))
    train_string = test(y_train, y_hatTrain_c)
    test_string = test(y_test, y_hatTest_c)

    print(cutoff / 20, ":\t", train_string, "\t", test_string)

print()

Predict with one question and all the most relevant review!
0.5 :	 0.682, 0.682, 1.000 	 0.686, 0.686, 1.000



In [75]:
confidence = [abs(y - 0.5)*2 for y in y_hatTest]
med = numpy.median(confidence)

print(med)

y_hatTest_c = list(constrain(y_hatTest, cutoff / 20))

y_test2 = [y for y, y_hat in zip(y_test, y_hatTest) if abs(y_hat-0.5)*2 > med]
y_hatTest_c2 = [y_c for y_c, y_hat in zip(y_hatTest_c, y_hatTest) if abs(y_hat-0.5)*2 > med]

print(test(y_test2, y_hatTest_c2))

y_test2 = [y for y, y_hat in zip(y_test, y_hatTest) if abs(y_hat-0.5)*2 > .9]
y_hatTest_c2 = [y_c for y_c, y_hat in zip(y_hatTest_c, y_hatTest) if abs(y_hat-0.5)*2 > .9]

print(test(y_test2, y_hatTest_c2))

1.0
nan, 0.000, 0.000
0.686, 0.686, 1.000


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


In [76]:
y_test2 = [y for y, y_hat in zip(y_test, y_hatTest) if abs(y_hat-0.5)*2 > 0]
y_hatTest_c2 = [y_c for y_c, y_hat in zip(y_hatTest_c, y_hatTest) if abs(y_hat-0.5)*2 > 0]
print(test(y_test2, y_hatTest_c2))

y_test2 = [y for y, y_hat in zip(y_test, y_hatTest) if abs(y_hat-0.5)*2 > .1]
y_hatTest_c2 = [y_c for y_c, y_hat in zip(y_hatTest_c, y_hatTest) if abs(y_hat-0.5)*2 > .1]
print(test(y_test2, y_hatTest_c2))

y_test2 = [y for y, y_hat in zip(y_test, y_hatTest) if abs(y_hat-0.5)*2 > .2]
y_hatTest_c2 = [y_c for y_c, y_hat in zip(y_hatTest_c, y_hatTest) if abs(y_hat-0.5)*2 > .2]
print(test(y_test2, y_hatTest_c2))

y_test2 = [y for y, y_hat in zip(y_test, y_hatTest) if abs(y_hat-0.5)*2 > .3]
y_hatTest_c2 = [y_c for y_c, y_hat in zip(y_hatTest_c, y_hatTest) if abs(y_hat-0.5)*2 > .3]
print(test(y_test2, y_hatTest_c2))

y_test2 = [y for y, y_hat in zip(y_test, y_hatTest) if abs(y_hat-0.5)*2 > .4]
y_hatTest_c2 = [y_c for y_c, y_hat in zip(y_hatTest_c, y_hatTest) if abs(y_hat-0.5)*2 > .4]
print(test(y_test2, y_hatTest_c2))

y_test2 = [y for y, y_hat in zip(y_test, y_hatTest) if abs(y_hat-0.5)*2 > .5]
y_hatTest_c2 = [y_c for y_c, y_hat in zip(y_hatTest_c, y_hatTest) if abs(y_hat-0.5)*2 > .5]
print(test(y_test2, y_hatTest_c2))

y_test2 = [y for y, y_hat in zip(y_test, y_hatTest) if abs(y_hat-0.5)*2 > .6]
y_hatTest_c2 = [y_c for y_c, y_hat in zip(y_hatTest_c, y_hatTest) if abs(y_hat-0.5)*2 > .6]
print(test(y_test2, y_hatTest_c2))

y_test2 = [y for y, y_hat in zip(y_test, y_hatTest) if abs(y_hat-0.5)*2 > .7]
y_hatTest_c2 = [y_c for y_c, y_hat in zip(y_hatTest_c, y_hatTest) if abs(y_hat-0.5)*2 > .7]
print(test(y_test2, y_hatTest_c2))

y_test2 = [y for y, y_hat in zip(y_test, y_hatTest) if abs(y_hat-0.5)*2 > .8]
y_hatTest_c2 = [y_c for y_c, y_hat in zip(y_hatTest_c, y_hatTest) if abs(y_hat-0.5)*2 > .8]
print(test(y_test2, y_hatTest_c2))

y_test2 = [y for y, y_hat in zip(y_test, y_hatTest) if abs(y_hat-0.5)*2 > .9]
y_hatTest_c2 = [y_c for y_c, y_hat in zip(y_hatTest_c, y_hatTest) if abs(y_hat-0.5)*2 > .9]
print(test(y_test2, y_hatTest_c2))


0.686, 0.686, 1.000
0.686, 0.686, 1.000
0.686, 0.686, 1.000
0.686, 0.686, 1.000
0.686, 0.686, 1.000
0.686, 0.686, 1.000
0.686, 0.686, 1.000
0.686, 0.686, 1.000
0.686, 0.686, 1.000
0.686, 0.686, 1.000
