# In [None]:
"""
The program allows us to read amazon review data and using different
algorithms carry out a sentimental analysis and process the data
"""
#import packages required
import csv
import re
import string
import pandas as pd
from sklearn.svm import SVC
from collections import Counter
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.neural_network import MLPClassifier
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier as knc
from sklearn.ensemble import GradientBoostingClassifier as gbc
# Module-level state shared by the helper functions in this file.
stopWords = []  # normalised stop words; empty until initializeSystem() fills it
dataSet = []  # (review, rating) tuples collected by loadFile()

# Verbose-mode pattern for a simple ASCII emoticon such as ":)" or ";-D".
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""

# Alternatives the tokenizer tries, in order, at every position.
# NOTE: emoticons_str is not one of these alternatives, so an emoticon in the
# input is split into single-character tokens rather than kept whole.
regex_str = [
    r'<[^>]+>',  # HTML tags
    r"(?:[a-z][a-z\-_]+[a-z])",  # words that start/end with a letter and may contain - or _
    r'(?:[\w_]+)',  # any other run of word characters
    r'(?:\S)'  # any remaining single non-whitespace character
]

# Tokenizer: the whole alternation sits in one capturing group, so findall()
# returns a flat list of strings.
tokens_re = re.compile(
    '({})'.format('|'.join(regex_str)),
    flags=re.VERBOSE | re.IGNORECASE,
)
# Matches only when the complete token is an emoticon (anchored at both ends).
emoticon_re = re.compile(
    '^{}$'.format(emoticons_str),
    flags=re.VERBOSE | re.IGNORECASE,
)
# Shared NLTK helpers: the same stemmer and lemmatiser instances normalise
# both the stop-word list and the review tokens.
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()

def initializeSystem():
    """Fill the global ``stopWords`` list with normalised stop words.

    The raw list is NLTK's English stop words, every ASCII punctuation
    character and a few extra terms. Each entry is lemmatised as a verb and
    then stemmed, the same normalisation applied to review tokens, so the
    two can be compared directly.

    Entries are appended to the existing list, so calling this more than
    once adds the same entries again.
    """
    raw_terms = stopwords.words('english')
    raw_terms = raw_terms + list(string.punctuation)
    raw_terms = raw_terms + ['rt', 'via', 'i\'m', 'us', 'it']
    stopWords.extend(
        stemmer.stem(lemmatiser.lemmatize(term, pos="v")) for term in raw_terms
    )
# Carry out preprocessing of review data
def preprocess(s, lowercase=True):
    """Split *s* into tokens and optionally normalise them.

    When ``lowercase`` is true, every token that is not a complete emoticon
    is lower-cased, lemmatised as a verb and stemmed; emoticon tokens are
    passed through untouched. When it is false the raw tokens are returned
    with no normalisation at all.
    """
    raw_tokens = tokens_re.findall(s)
    if not lowercase:
        return raw_tokens

    normalised = []
    for tok in raw_tokens:
        if emoticon_re.search(tok):
            normalised.append(tok)
        else:
            normalised.append(stemmer.stem(lemmatiser.lemmatize(tok.lower(), pos="v")))
    return normalised

# Preprocess a string and drop the terms that carry no signal
def processString(string):
    """Return the preprocessed terms of *string* that are worth keeping.

    A term is discarded when it is a stop word, when it is at most one
    character long, or when it is purely numeric.
    """
    kept = []
    for term in preprocess(string):
        if term in stopWords:
            continue
        if len(str(term)) <= 1:
            continue
        if term.isnumeric():
            continue
        kept.append(term)
    return kept
# Function to manage reading of files
def loadFile(filePath):
    """Read ``(review, rating)`` pairs from a CSV file.

    For every non-empty row the second column is taken as the review and
    the last column as the rating; both are kept as the strings the csv
    reader produced.

    Args:
        filePath: Path of the CSV file to read.

    Returns:
        A new list holding only the rows read from *filePath*. Returning a
        fresh list (instead of the shared global) keeps rows loaded by an
        earlier call, e.g. the training file, out of a later result such as
        the test set.

    Side effects:
        The rows are also appended to the module-level ``dataSet`` list,
        which therefore accumulates across calls.

    Raises:
        IndexError: If a non-empty row has fewer than two columns.
    """
    # newline="" lets the csv module do its own newline handling, which is
    # required for quoted fields that contain line breaks. The context
    # manager guarantees the file is closed even if a row is malformed.
    with open(filePath, "r", newline="") as fileRead:
        reader = csv.reader(fileRead, dialect='excel')
        # Blank lines come back as empty lists; skip them rather than crash.
        rows = [(row[1], row[-1]) for row in reader if row]
    dataSet.extend(rows)
    return rows

def prepareSparseMatrix(convertedReviews, decisionAttributes):
    """Build a term-count matrix for the given reviews.

    Despite the name, the result is a dense list of lists: one row per
    review and one column per entry of *decisionAttributes*, in the same
    order.

    Args:
        convertedReviews: Iterable of reviews, each an iterable of terms.
        decisionAttributes: Sequence of terms that define the columns.

    Returns:
        A list of rows; ``row[i]`` is how many times ``decisionAttributes[i]``
        occurs in that review. Terms that are not decision attributes are
        ignored.
    """
    # Map each attribute to its column once, instead of scanning the list
    # twice per word. setdefault keeps the first position if an attribute is
    # repeated, which matches what list.index() would return.
    columnOf = {}
    for position, attribute in enumerate(decisionAttributes):
        columnOf.setdefault(attribute, position)

    width = len(decisionAttributes)
    sparseMatrix = []
    for review in convertedReviews:
        counts = [0] * width
        for word in review:
            column = columnOf.get(word)
            if column is not None:
                counts[column] += 1
        sparseMatrix.append(counts)
    return sparseMatrix
# Function to turn raw reviews into lists of processed terms
def convertReviews(reviews):
    """Return one list of processed terms per review.

    Each review is converted to ``str``, lower-cased and passed through
    ``processString``; the output order follows the input order.
    """
    return [processString(str(review).lower()) for review in reviews]

def getDecisionAttributes(convertedReviews, maxAttributes=500):
    """Return the most frequent terms across all reviews.

    Terms are counted directly from the token lists. Counting tokens rather
    than a re-joined string means the last word of one review can no longer
    fuse with the first word of the next, and empty input yields an empty
    list instead of a single empty-string attribute.

    Args:
        convertedReviews: Iterable of reviews, each an iterable of terms.
        maxAttributes: Upper bound on the number of terms returned
            (default 500).

    Returns:
        Up to *maxAttributes* terms, most frequent first. Terms with equal
        counts keep the order in which they were first seen.
    """
    termCounts = Counter()
    for review in convertedReviews:
        termCounts.update(review)
    # most_common() already copes with a limit larger than the vocabulary.
    return [term for term, _ in termCounts.most_common(maxAttributes)]
# Function to train every classifier on the training data
def model_data(training_data):
    """Fit six scikit-learn classifiers on the same training set.

    Args:
        training_data: Mapping with the feature matrix under ``'data'`` and
            the target labels under ``'result'``.

    Returns:
        A dict mapping a display name to the fitted estimator, in the order
        the estimators were trained.
    """
    features = training_data['data']
    labels = training_data['result']

    dtc = DecisionTreeClassifier(random_state=9, min_samples_split=5)
    dtc.fit(features, labels)

    nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
    nn.fit(features, labels)

    svc = SVC(C=100, kernel="linear")
    svc.fit(features, labels)

    # bootstrap must be a real bool: the string 'true' only worked because it
    # is truthy, and scikit-learn releases that validate parameters reject it.
    rfc = RFC(n_estimators=10, criterion='entropy', max_depth=10, min_samples_split=5, bootstrap=True, random_state=None)
    rfc.fit(features, labels)

    knc_map = knc(n_neighbors=15, weights='distance')
    knc_map.fit(features, labels)

    gbc_map = gbc(n_estimators=150, verbose=0)
    gbc_map.fit(features, labels)

    return {
        'Decision Tree Classifier': dtc,
        'Neural Networks': nn,
        'Support Vector Machines': svc,
        'Random Forest Classification': rfc,
        'k Nearest Neighbours': knc_map,
        'Gradient Boosting Classifier': gbc_map
    }

# Function to score the trained models on the test data
def test_models(test_data, models):
    """Print the accuracy of every model on the test set.

    ``test_data`` supplies the features under ``'data'`` and the expected
    labels under ``'result'``. ``models`` maps a display name to a fitted
    estimator. Each estimator's ``score`` is printed as a percentage with
    two decimals. Nothing is returned.
    """
    print("Prediction rating:\n")
    for name, fitted in models.items():
        accuracy = fitted.score(test_data['data'], test_data['result']) * 100.00
        line = str(name) + ": " + "%.2f" % accuracy + "%"
        print(line)  # Print the results


# Build the normalised stop-word list before any review is processed.
initializeSystem()

# --- Training set ---------------------------------------------------------
# Load the training data
training_data = loadFile("Data1.csv")
trainDataFeaturesReviews = pd.DataFrame(training_data, columns=["review", "rating"])
targetRating = (trainDataFeaturesReviews['rating'])
targetReview = trainDataFeaturesReviews['review']
# Tokenise and normalise every review, then derive the vocabulary (the
# decision attributes) from the training reviews only.
trainReviews = convertReviews(targetReview)
decisionAttributes = getDecisionAttributes(trainReviews)
# One row of term counts per review, one column per decision attribute.
trainSparseMatrix = prepareSparseMatrix(trainReviews, decisionAttributes)
dataFeatures = pd.DataFrame(trainSparseMatrix, columns=decisionAttributes)
# From here on training_data is the dict consumed by model_data(), no longer
# the raw list returned by loadFile().
training_data = {
    'data': dataFeatures,
    'result': targetRating
}

# --- Test set -------------------------------------------------------------
# Load the test data
test_data = loadFile("Data2.csv")
testDataFeaturesReviews = pd.DataFrame(test_data, columns=["review", "rating"])
testReview = testDataFeaturesReviews['review']
testRating = testDataFeaturesReviews['rating']
# Reuse the training vocabulary so the test columns line up with the
# columns the models were trained on.
testSparseMatrix = prepareSparseMatrix(convertReviews(testReview), decisionAttributes)
testDataFeatures = pd.DataFrame(testSparseMatrix, columns=decisionAttributes)
test_data = {
    'data': testDataFeatures,
    'result': testRating
}

# Train every classifier, then report its accuracy on the test set.
models = model_data(training_data)
test_models(test_data, models)