In [147]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import numpy as np
import os

In [148]:
# Folder paths, relative to the notebook's working directory.
path_pos = "../../Datasets/movies_reviews/pos"
path_neg = "../../Datasets/movies_reviews/neg"


def list_txt_files(folder):
    """Return the paths of all ``.txt`` files located directly inside ``folder``.

    The directory is listed by path instead of ``os.chdir``-ing into it, so the
    kernel's working directory is never modified and no machine-specific
    absolute path is needed to recover from a half-finished run.

    Parameters
    ----------
    folder : str
        Directory to scan (not recursive).

    Returns
    -------
    list of str
        ``"<folder>/<name>"`` for every entry whose name ends with ``.txt``,
        sorted by file name so the result does not depend on the order in
        which the file system happens to enumerate entries.
    """
    return [
        f"{folder}/{name}"
        for name in sorted(os.listdir(folder))
        if name.endswith(".txt")
    ]


# Review files for each sentiment class.
pos_files = list_txt_files(path_pos)
neg_files = list_txt_files(path_neg)

In [149]:
# Read each file and store its content

def readFiles(files):
    """Return the full text of every path in ``files``, in the same order.

    Each file is opened in text mode with the platform's default encoding and
    closed again as soon as it has been read.
    """
    def _slurp(path):
        with open(path) as handle:
            return handle.read()

    return [_slurp(path) for path in files]

# Load the raw text of every positive review, then of every negative review.
pos_content, neg_content = (readFiles(paths) for paths in (pos_files, neg_files))


In [150]:
import re

# Returns dictionary of word counts for a text
def get_word_counts(text, all_words):
    """Count how often each retained word occurs in ``text``.

    ``text`` is tokenized with ``get_words``. A token is retained only when it
    is not in the module-level ``stopwords`` collection and is contained in
    ``all_words``.

    Returns a dict mapping each retained word to its number of occurrences;
    words that never occur are simply absent.
    """
    counts = {}
    for token in get_words(text):
        # Skip stop words first, then anything outside the allowed vocabulary.
        if token in stopwords or token not in all_words:
            continue
        counts[token] = counts.get(token, 0) + 1
    return counts

# Matches every run of characters that are not ASCII letters. Compiled once
# here rather than on each call. The character class deliberately has a single
# leading '^': a second '^' inside the class would be a literal caret and
# would make '^' count as a word character.
_NON_ALPHA_RUN = re.compile(r'[^A-Za-z]+')


# splits text into words
def get_words(txt):
    """Split ``txt`` into lowercase words.

    Every run of non-letter characters (anything outside ``A-Z``/``a-z``)
    acts as a separator.

    Parameters
    ----------
    txt : str
        Text to tokenize.

    Returns
    -------
    list of str
        The lowercase words in order of appearance; empty pieces produced by
        leading or trailing separators are dropped.
    """
    return [piece.lower() for piece in _NON_ALPHA_RUN.split(txt) if piece]


# converts counts into a vector
def get_word_vector(word_list, wc):
    """Lay the counts in ``wc`` out in the order given by ``word_list``.

    Returns a list with one entry per element of ``word_list``: the value
    stored in ``wc`` for that word, or 0 when the word is not a key of ``wc``.
    """
    return [wc[term] if term in wc else 0 for term in word_list]


# prints matrix
def print_word_matrix(docs):
    """Print the first two items of every entry in ``docs``, one entry per line."""
    for entry in docs:
        label = entry[0]
        counts = entry[1]
        print(label, counts)

In [151]:
stop_words_file = "stop_words.txt"

# One stop word per line, surrounding whitespace removed. The context manager
# closes the file even if reading fails part-way through.
with open(stop_words_file, "r", encoding="utf-8") as f:
    stopwords = [line.strip() for line in f]

print(stopwords[:10])

['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost']


In [152]:
all_words = {}
doc_id = 1
vectors = []

# Set copy of the stop words: membership tests on a set are constant time,
# whereas the original list is scanned linearly for every token of every review.
stopword_set = set(stopwords)

# Corpus-wide frequency of every non-stop word, across both sentiment classes.
for review in pos_content + neg_content:
    for w in get_words(review):
        if w not in stopword_set:
            all_words[w] = all_words.get(w, 0) + 1

# Vocabulary: words that occur more than once in the whole corpus.
unique_words = {w for w, count in all_words.items() if count > 1}

# One [document id, word-count dict] entry per review. Positive reviews are
# vectorized first, then negative ones; ids run d1, d2, ... in that order.
for review in pos_content + neg_content:
    vectors.append(["d" + str(doc_id), get_word_counts(review, unique_words)])
    doc_id += 1

# Freeze the vocabulary as a sorted list so the column order of the feature
# matrix is the same on every run (iteration order of a set of strings is not).
unique_words = sorted(unique_words)
#print("All unique words:",unique_words)

In [153]:
print(len(vectors))
# Labels: 1.0 for every positive review followed by 0.0 for every negative
# review. The class sizes are taken from the loaded data rather than
# hard-coded, so the label vector always has one entry per review.
Y = np.concatenate((np.ones(len(pos_content)), np.zeros(len(neg_content))))
print(len(Y))

2005
2005


In [154]:
# Write the document/word count matrix as tab-separated text. The context
# manager closes the file even if a write fails, and the encoding is stated
# explicitly because the file is read back as UTF-8 further down.
with open("reviews_vectors.txt", "w", encoding="utf-8") as out:
    # Header row: a leading tab (empty row-name column) followed by the words.
    out.write(''.join('\t' + w for w in unique_words))
    out.write('\n')

    # One row per document: its id, then one count per vocabulary word.
    for doc in vectors:
        vector = get_word_vector(unique_words, doc[1])
        out.write(doc[0])
        out.write(''.join('\t' + str(x) for x in vector))
        out.write('\n')

In [155]:
reviews_vectors_file = "reviews_vectors.txt"
# Read the whole file into ``s``; the context manager releases the file handle
# instead of leaving it open for the rest of the session.
with open(reviews_vectors_file, "r", encoding="utf-8") as f:
    s = f.read()

In [156]:
def read_vector_file(file_name):
    """Parse a tab-separated matrix file into row names, column names and data.

    The first line holds the column headers. Every following line holds a row
    name in its first field and numeric values in the remaining fields. Lines
    with fewer than two fields (e.g. blank lines) are ignored.

    Parameters
    ----------
    file_name : str
        Path of the file to read (opened with the platform default encoding).

    Returns
    -------
    tuple
        ``(rownames, colnames, data)`` where ``rownames`` and ``colnames`` are
        lists of str and ``data`` is a list of rows, each a list of float.
        An empty file yields ``([], [], [])``.

    Raises
    ------
    ValueError
        If a value field cannot be converted to ``float``.
    """
    # Read everything up front so the handle is closed before parsing starts.
    with open(file_name) as f:
        lines = f.readlines()

    if not lines:
        return [], [], []

    # First line is the column headers; strip() also removes the leading tab
    # that stands in for the (empty) row-name column.
    colnames = lines[0].strip().split('\t')

    rownames = []
    data = []
    for line in lines[1:]:
        p = line.strip().split('\t')
        # First column in each row is the rowname
        if len(p) > 1:
            rownames.append(p[0])
            # The data for this row is the remainder of the row
            data.append([float(x) for x in p[1:]])
    return rownames, colnames, data


# This function will transpose the data matrix
def rotatematrix(data):
    """Return the transpose of ``data`` as a new list of lists.

    Parameters
    ----------
    data : sequence of sequences
        Row-major matrix. The number of columns is taken from the first row.

    Returns
    -------
    list of list
        One list per column of ``data``; ``[]`` when ``data`` has no rows.
    """
    # An empty matrix has no first row to measure; its transpose is empty too.
    if len(data) == 0:
        return []
    width = len(data[0])
    return [[row[col] for row in data] for col in range(width)]

In [157]:
reviews, words, data = read_vector_file(reviews_vectors_file)

# Hold out 20% of the reviews for testing. The seed is fixed so the split, and
# therefore every score computed from it, is the same on each run.
X_train, X_test, Y_train, Y_test = train_test_split(
    data, Y, test_size=0.2, random_state=42
)


In [158]:
# Fit a multinomial naive Bayes classifier on the training split
# (fit() returns the estimator itself, so construction and fitting chain).
clf = MultinomialNB().fit(X_train, Y_train)

# Mean accuracy on the data the model was fitted on, then on the held-out data.
print("Normalized train score:", clf.score(X_train, Y_train))
print("Normalized test score:", clf.score(X_test, Y_test))

Normalized train score: 0.9731920199501247
Normalized test score: 0.8229426433915212


In [159]:
test_path = '5new'

# List the directory by path instead of chdir-ing into it and back, so the
# kernel's working directory is never left pointing at the wrong folder if
# something fails in between.
# NOTE(review): the order is kept exactly as os.listdir returns it because the
# hand-written test labels later in the notebook are positional -- confirm they
# line up with this order on the machine where the notebook runs.
test_files = [
    f"{test_path}/{file}"
    for file in os.listdir(test_path)
    if file.endswith(".txt")  # keep text files only
]

print(test_files)

['5new/r3.txt', '5new/r2.txt', '5new/r1.txt', '5new/r5.txt', '5new/r4.txt']


In [166]:
test_content = readFiles(test_files)

# Set copy of the vocabulary: get_word_counts tests membership once per token,
# which is a linear scan when the vocabulary is passed as a list.
vocabulary = set(unique_words)

# One [document id, word-count dict] entry per new review. Ids continue from
# the current value of ``doc_id`` and are distinct per review; ``doc_id`` itself
# is left untouched so re-running this cell produces the same ids.
test_vectors = [
    ["d" + str(doc_id + offset), get_word_counts(review, vocabulary)]
    for offset, review in enumerate(test_content)
]
        

In [167]:
# Write the test document/word count matrix as tab-separated text. The context
# manager closes the file even if a write fails, and the encoding is stated
# explicitly because the file is read back as UTF-8 further down.
with open("test_vectors.txt", "w", encoding="utf-8") as out:
    # Header row: a leading tab (empty row-name column) followed by the words.
    out.write(''.join('\t' + w for w in unique_words))
    out.write('\n')

    # One row per test document: its id, then one count per vocabulary word.
    for doc in test_vectors:
        vector = get_word_vector(unique_words, doc[1])
        out.write(doc[0])
        out.write(''.join('\t' + str(x) for x in vector))
        out.write('\n')

In [168]:
test_vectors_file = "test_vectors.txt"
# Read the whole file into ``s``; the context manager releases the file handle
# instead of leaving it open for the rest of the session.
with open(test_vectors_file, "r", encoding="utf-8") as f:
    s = f.read()

In [170]:
reviews, words, test_data = read_vector_file(test_vectors_file)

# Hand-assigned labels for the new reviews, one per row of ``test_data``.
# NOTE(review): presumably 1 = positive and 0 = negative, matching the training
# labels, and listed in the same order as the test files -- confirm.
test_class = [1, 0, 0, 1, 1]

# Fail loudly on a size mismatch: the accuracy helper below compares by
# position and would otherwise ignore surplus labels or hit an IndexError.
if len(test_class) != len(test_data):
    raise ValueError(
        "expected {} test labels but found {}".format(len(test_data), len(test_class))
    )

y_predicted = clf.predict(test_data)

def accuracy(preds, actual):
    """Return the percentage of entries in ``preds`` equal to ``actual`` at the same index.

    The result is a float between 0 and 100. ``actual`` is indexed by position
    for every element of ``preds``.
    """
    total = len(preds)
    hits = sum(1 for idx, guess in enumerate(preds) if guess == actual[idx])
    return (hits / total) * 100

# The values are sentiment class labels, not prices; label them accordingly.
print("Predicted class:", y_predicted)
print("Actual class:", test_class)
print("Test Data Accuracy: {}%".format(accuracy(y_predicted, test_class)))

Predicted price: [1. 1. 0. 1. 1.]
Actual price: [1, 0, 0, 1, 1]
Test Data Accuracy: 80.0%
