In [1]:
import numpy
import random
from collections import defaultdict
import urllib
import math
import random
import collections
import string
import csv
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
import sklearn.metrics
from sklearn.svm import LinearSVC
from nltk.corpus import stopwords

In [2]:
def parseLabeledData(path):
    file=open(path, 'r')
    dataList = []
    for line in csv.reader(file):
        if len(line) == 4:
            dataList.append(
                {"asin":line[0], 
                 "question":line[1],
                 "review":line[2],
                 "label":line[3]}
            )     
    return dataList
        

print("Reading labeled data...")
data = parseLabeledData("/Users/Silvia/Desktop/LabelledData.csv")
queries = [d['question'] for d in data]
reviews = [d['review'] for d in data]
labels = [d['label'] for d in data]
print("done")

Reading labeled data...
done


In [3]:
def parseAllQueries(path):
    file = open(path, 'r')
    dataList = defaultdict(lambda: [])
    for line in file:
        line = eval(line)
        dataList[line["asin"]].append(line)
      
    return dataList

def parseAllReviews(path):
    file = open(path, 'r')
    dataList = defaultdict(lambda: [])
    for line in file:
        line = eval(line)
        dataList[line["asin"]].append(line)
      
    return dataList

print("Reading all reviews & all questions...")
allReviews = parseAllReviews("/Users/Silvia/Downloads/reviews.json")
allQuestions = parseAllQueries("/Users/Silvia/Downloads/qa.json")

# do we have to remove questions that have no reviews or reviews that have no questions??
docSet = []
for entry in allReviews.values():
    for review in entry:
        docSet.append(review["reviewText"])

for entry in allQuestions.values():
    for question in entry:
        docSet.append(question["question"])

print("done")

Reading all reviews & all questions...
done


In [27]:
# @param a word whose frequency in the document we are calculating
# @param document a string of a review or a question
# @return the frequency of term in document div length of document

def tf(term, document):
    count = collections.defaultdict(int)
    for word in document.split():
        count[word] += 1

    return count[term]/len(document.split())

In [13]:
def findCommonWords():
    allWords = defaultdict(int)
    englishStopWords = stopwords.words('english')
    for r in allReviews.values():
        for review in r:
            review = review["reviewText"]
            exclude = set(string.punctuation)
            review = ''.join(ch for ch in review if ch not in exclude)
            for w in review.lower().split():
                if w not in englishStopWords:
                    allWords[w] += 1

    for q in allQuestions.values():
        for question in q:
            question = question["question"]
            exclude = set(string.punctuation)
            question = ''.join(ch for ch in question if ch not in exclude)
            for w in question.lower().split():
                if w not in englishStopWords:
                    allWords[w] += 1
    
    
    return sorted(allWords, key=lambda x: -allWords[x])[:1000]

In [14]:
commonWords = findCommonWords()
print(commonWords)

['one', 'use', 'light', 'would', 'like', 'well', 'good', 'great', 'work', 'get', 'used', 'need', 'tool', 'time', 'much', 'also', 'dont', 'easy', 'little', 'really', 'saw', 'works', 'set', 'two', 'product', 'price', 'bought', 'made', 'battery', 'using', 'small', 'nice', 'even', 'im', 'bit', 'batteries', 'quality', 'long', 'better', 'power', 'buy', 'ive', 'could', 'make', 'enough', 'unit', 'tools', 'blade', 'drill', 'led', 'want', 'still', 'way', 'back', 'around', 'lights', 'water', 'first', 'put', 'fit', '2', 'bulb', 'see', 'new', 'switch', 'without', 'bright', 'thing', 'years', 'go', 'got', 'many', 'know', 'size', 'cut', 'plastic', 'find', 'right', 'come', 'old', 'box', 'lot', 'last', 'think', 'door', 'flashlight', 'bulbs', '3', 'recommend', 'keep', 'house', 'sure', 'home', 'another', 'far', 'case', 'cant', 'since', 'doesnt', 'job', 'hold', 'needed', 'something', 'problem', 'seems', 'turn', 'fine', 'makes', 'side', 'easily', 'high', 'say', 'best', 'pretty', 'comes', 'didnt', '4', 'bits

In [15]:
idfDict = defaultdict(float)
for word in commonWords:
    count = 0         
    for doc in docSet:
        if word in doc.lower():
            count += 1
    idfScore = math.log(len(docSet)/(count+1))
     
    idfDict[word] = idfScore

In [16]:
def idf(term):
    return idfDict[term]

In [17]:
def wordToIndex(term):
    if term in commonWords:
        return commonWords.index(term)
    else:
        return -1

In [18]:
def numCommonWords(review, question):
    
    filtered_words = [word for word in review.split() if word not in stopwords.words('english')]
    num = 0
    words = set(filtered_words)
    for word in words:
        countReview = 0
        countQuestion = 0
        countReview = review.split().count(word)
        countQuestion = question.split().count(word)
        
        num = num + min(countReview, countQuestion)
        
    #print(num)
    return num

In [19]:
def lengthDiff(review, question):
    return abs(len(review.split()) - len(question.split()))

In [20]:
def tfidf(document):
    document = document.lower()
    document = ''.join([c for c in document if not (c in string.punctuation)])
    
    feat = collections.defaultdict(int)
    for term in document.split():
        tfscore = tf(term, document)
        idfscore = idf(term)
        feat[term] = tfscore * idfscore
    
    return feat

In [21]:
# queryFeat is a feature vector for the query and reviewFeat is the feature vector for the review
def cosineSimilarity(queryFeat, reviewFeat):
    # Find the words the 2 dictionaries have in common
    querySet = set(queryFeat.keys())
    reviewSet = set(reviewFeat.keys())
    allWords = querySet.union(reviewSet)
    
    # Find the cosine similarity
    numerator = 0
    mag1 = 0
    mag2 = 0
    for word in allWords:
        numerator = numerator + queryFeat[word] * reviewFeat[word]
        mag1 = mag1 + queryFeat[word]**2
        mag2 = mag2 + reviewFeat[word]**2
    if mag1 > 0 and mag2 > 0:
        return (numerator/((mag1*mag2)**0.5))
    else:
        return -1

In [22]:
def feature(review, question):
    feat = [1]
    
    #number of Common Words
    #difference in length
    #length of review
    #length of question
    feat.append(numCommonWords(review, question))
    feat.append(lengthDiff(review,question))
    feat.append(len(review.split()))
    feat.append(len(question.split()))
    cosine = cosineSimilarity(tfidf(review), tfidf(question))
    feat.append(cosine)
    
    return feat

In [23]:
def feature_tfidf(review, question):
    feat = [1]
    feat.append(cosineSimilarity(tfidf(review), tfidf(question)))
    return feat
    
    

In [24]:
def normalize(featList):
    
    max = 0
    min = float('inf')
    for feat in featList:
        if feat > max: max = feat
        if feat < min: min = feat        
    
    for i in range(0,len(featList)-1):
        if (max - min) == 0: 
            max = 1
            min = 0
        featList[i] = (featList[i] - min) / (max - min)

    return featList

In [25]:
def constrain(elems):
    for elem in elems:
        if (elem > 0.5): yield 1
        else: yield 0
        #if elem > 1: yield 1
        #elif elem < 0: yield 0
        #else: yield elem

In [31]:
def pipeline(X, y):

    random.seed(171727)
    
    for j in range(0,len(X[0])):
        featList = []
        for i in range(0,len(X)-1):
            featList.append(X[i][j])
        featList = normalize(featList)
        for i in range(0,len(X)-1):
            X[i][j] = featList[i]
    
    keys = list(range(1, len(labels)))
    points = dict(zip(keys, zip(X, y)))
    random.shuffle(keys)
    X = [points[key][0] for key in keys]
    y = [points[key][1] for key in keys]
    
    X_train = X[:len(X)//2]
    y_train = y[:len(y)//2]
    
    X_test = X[len(X)//2:]
    y_test = y[len(y)//2:]
    
    
    #theta,residuals,rank,s = numpy.linalg.lstsq(X_train, y_train, rcond=None)
    
    #X_train = numpy.matrix(X_train)
    #y_train = numpy.matrix(y_train).T
    #theta = numpy.matrix(theta).T
    #y_hatTrain = X_train * theta
    
    #X_test = numpy.matrix(X_test)
    #y_test = numpy.matrix(y_test).T
    #y_hatTest = X_test * theta
    
    # Find mean of Training and Test
    #y_barTrain = (sum(y_train) / len(y_train))
    #y_barTest = (sum(y_test) / len(y_test))
    
    #print(r2_score(y_test, y_hatTest))
    
    #clf = LinearSVC(random_state=0, C=1.0)
    #clf.fit(X_train, y_train)
    #y_hatTrain = clf.predict(X_train)
    #y_hatTest = clf.predict(X_test)
    
    # the accuracy score is very .77. Regardless of the C we chose. But the C score DID, in fact, impact our
    # training accuracy
    
    
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_hatTrain = lr.predict(X_train)
    y_hatTest = lr.predict(X_test)
    
    #print(X_train)
    
    y_hatTrain = list(constrain(y_hatTrain))
    y_hatTest = list(constrain(y_hatTest))
    
    
    
    
    #print(y_hatTrain)
    print(y_test)
    print(y_hatTest)
    
    print("train:")
    #print(sklearn.metrics.r2_score(y_train, y_hatTrain))
    print("accuracy: ", sklearn.metrics.accuracy_score(y_train, y_hatTrain), "\t", \
          "precision: ", sklearn.metrics.precision_score(y_train, y_hatTrain), "\t", \
          "recall: ", sklearn.metrics.recall_score(y_train, y_hatTrain), "\t", \
          "auc: ", sklearn.metrics.roc_auc_score(y_train, y_hatTrain), "\t", \
          "f1: ", sklearn.metrics.f1_score(y_train, y_hatTrain))
          
    print("test:")
    #print(sklearn.metrics.r2_score(y_test, y_hatTest))
    print("accuracy: ", sklearn.metrics.accuracy_score(y_test, y_hatTest), "\t", \
          "precision: ", sklearn.metrics.precision_score(y_test, y_hatTest), "\t", \
          "recall: ", sklearn.metrics.recall_score(y_test, y_hatTest), "\t", \
          "auc: ", sklearn.metrics.roc_auc_score(y_test, y_hatTest), "\t", \
          "f1: ", sklearn.metrics.f1_score(y_test, y_hatTest))
    

print("Our algo")
X = [feature(d["review"], d["question"]) for d in data]
y = [1 if l == "y" else 0 for l in labels]
pipeline(X, y)
print("\n")

print("Not our algo -- (this one actually works) -- or not lol")
X = [feature_tfidf(d["review"], d["question"]) for d in data]
y = [1 if l == "y" else 0 for l in labels]
pipeline(X, y)
print("\n")
    
    


Our algo
[1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
train:
accuracy:  0.8050847457627118 	 precision:  0.75 	 recall:  0.12 	 auc:  0.5546236559139786 	 f1:  0.20689655172413793
test:
accuracy:  0.847457627118644 	 precision:  0.6666666666666666 	 recall:  0.10526315789473684 	 auc:  0.5475810738968634 	 f1:  0.18181818181818182




  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
