In [11]:
import json
from nltk import FreqDist
from nltk import word_tokenize, pos_tag
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import random
from sklearn import linear_model
from sklearn import svm
from sklearn import ensemble
from sklearn import metrics
import numpy as np

In [12]:
# non-alpha detector used to drop punctuation / number tokens
def alpha_filter(content):
    """Tell whether ``content`` contains no lowercase ASCII letter at all.

    Returns True when the string is non-empty and every character is
    something other than ``a``-``z``; returns False otherwise (including
    for the empty string, which the ``+`` quantifier cannot match).
    """
    return re.match('^[^a-z]+$', content) is not None

In [13]:
def get_stopwords_list(reserve_words_path='reserve_words.txt'):
    """Build the stopword list used for filtering review tokens.

    Starts from NLTK's English stopword list and removes every word that
    appears (after ``nltk.word_tokenize``) in the reserve-words file, so
    those words survive stopword filtering.

    Args:
        reserve_words_path: Path of the text file listing words to keep.
            Defaults to ``'reserve_words.txt'``, the path this function
            always used before the parameter existed.

    Returns:
        A list of unique stopwords, in arbitrary order.
    """
    stopwords = nltk.corpus.stopwords.words('english')
    # The context manager closes the file even if read() raises.
    with open(reserve_words_path, 'r') as words:
        words_text = words.read()
    # A set makes each membership test below constant-time.
    reserved_words = set(nltk.word_tokenize(words_text))
    unique_stopwords = {word for word in stopwords if word not in reserved_words}
    return list(unique_stopwords)

In [14]:
def get_pos(word_pos_tag):
    """Translate a ``pos_tag`` tag string into the matching wordnet constant.

    Only the first character of the tag is inspected: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag (or an
    empty one) yields None.
    """
    initial = word_pos_tag[:1]
    if initial == 'J':
        return wordnet.ADJ
    if initial == 'V':
        return wordnet.VERB
    if initial == 'N':
        return wordnet.NOUN
    if initial == 'R':
        return wordnet.ADV
    return None

In [15]:
# keep only the tokens that alpha_filter does not reject
def alphabetical_words(content):
    """Return the items of ``content`` for which ``alpha_filter`` is False.

    The input order is preserved and a new list is returned.
    """
    kept = []
    for token in content:
        if alpha_filter(token):
            # token has no lowercase letter at all -> drop it
            continue
        kept.append(token)
    return kept

In [16]:
# tokenization step
def do_word_tokenize(content):
    """Split a raw text string into tokens with ``nltk.word_tokenize``.

    Every ``@`` is replaced by a space before tokenizing.
    """
    without_at_signs = content.replace("@", " ")
    return nltk.word_tokenize(without_at_signs)

In [17]:
# lowercasing step
def lower_work(content):
    """Return a new list holding ``item.lower()`` for each item of ``content``."""
    lowered = []
    for token in content:
        lowered.append(token.lower())
    return lowered

In [18]:
# frequency report: the 50 most common items
def print_top_words(content):
    """Print the 50 most frequent items of ``content``, one entry per line.

    Each printed entry is whatever ``FreqDist.most_common`` yields for that
    item. Nothing is returned.
    """
    for entry in FreqDist(content).most_common(50):
        print(entry)

In [19]:
# full preprocessing pipeline for one review text
def do_preprocess_data(content, stopwords=None):
    """Turn a raw text string into a list of cleaned, lemmatized tokens.

    Steps, in order: tokenize (``do_word_tokenize``), lowercase
    (``lower_work``), POS-tag (``pos_tag``), lemmatize each token with the
    wordnet POS derived from its tag (falling back to noun when
    ``get_pos`` returns None), drop tokens rejected by
    ``alphabetical_words``, and finally drop stopwords.

    Args:
        content: The raw text to preprocess.
        stopwords: Optional iterable of stopwords to remove. When None
            (the default) the list is obtained from
            ``get_stopwords_list()`` on every call, exactly as before.
            Callers processing many documents can build the list once and
            pass it in to avoid repeating that work.

    Returns:
        The surviving lemmas as a list, in their original order.
    """
    tokens = lower_work(do_word_tokenize(content))
    lemmatizer = WordNetLemmatizer()
    # pos_tag only assigns tags; the lemmatization itself happens here.
    lemmas = [
        lemmatizer.lemmatize(token, pos=get_pos(tag) or wordnet.NOUN)
        for token, tag in pos_tag(tokens)
    ]
    alpha_words = alphabetical_words(lemmas)
    if stopwords is None:
        stopwords = get_stopwords_list()
    # Set lookup instead of scanning the stopword list for every token.
    stopword_set = set(stopwords)
    return [w for w in alpha_words if w not in stopword_set]

In [20]:
# vocabulary selection for the bag-of-words features
def word_features_generate(all_words_list):
    """Return up to 2000 of the most frequent items of ``all_words_list``.

    The items come back as a plain list, in the order given by
    ``FreqDist.most_common``; the counts themselves are discarded.
    """
    frequencies = nltk.FreqDist(all_words_list)
    return [entry[0] for entry in frequencies.most_common(2000)]

In [21]:
# fill one row of the feature matrix
def linear_document_features(document, word_features, features, index):
    """Mark which feature words occur in ``document``, in place.

    For every position ``i`` whose ``word_features[i]`` is a member of
    ``set(document)``, ``features[index][i]`` is incremented by one. The
    increment happens at most once per position per call, regardless of
    how often the word occurs in the document. Nothing is returned.
    """
    present = set(document)
    for position, word in enumerate(word_features):
        if word in present:
            features[index][position] += 1

In [22]:
# build the feature matrix, fit a linear regression, report hold-out metrics
def linear_classifier_train(documents, word_features, features, test_size=2000):
    """Fit a ``LinearRegression`` on bag-of-words features and evaluate it.

    Each ``(document, rating)`` pair in ``documents`` fills the matching row
    of ``features`` through ``linear_document_features`` (so ``features`` is
    modified in place). The first ``test_size`` rows form the test set and
    the remaining rows the training set. MSE, MAE and R^2 on the test set
    are printed.

    Args:
        documents: Sequence of ``(document, rating)`` pairs.
        word_features: The feature vocabulary, one entry per column.
        features: Pre-allocated matrix with one row per document and one
            column per entry of ``word_features``.
        test_size: Number of leading documents held out for evaluation.
            Defaults to 2000, the value that was previously hard-coded.

    Returns:
        The fitted model.

    Raises:
        ValueError: If ``test_size`` does not leave at least one document
            on each side of the split. Previously this case only failed
            later, inside scikit-learn, on an empty array.
    """
    total = len(documents)
    if not 0 < test_size < total:
        raise ValueError(
            "test_size must leave at least one training and one test "
            "document; got test_size=%d for %d documents" % (test_size, total)
        )
    rating_values = []
    for index, (d, c) in enumerate(documents):
        linear_document_features(d, word_features, features, index)
        rating_values.append(c)
    X_test, Y_test = features[:test_size], rating_values[:test_size]
    X_train, Y_train = features[test_size:], rating_values[test_size:]
    model = linear_model.LinearRegression()
    # alternatives tried by the author:
    # model = linear_model.LogisticRegression()
    # model = svm.SVR()
    model.fit(X_train, Y_train)
    Y_predict = model.predict(X_test)
    print("MSE is:", metrics.mean_squared_error(Y_test, Y_predict))
    print("MAE is:", metrics.mean_absolute_error(Y_test, Y_predict))
    print("R square is: ", metrics.r2_score(Y_test, Y_predict))
    return model

In [27]:
# Load the reviews: data.json is read as one JSON object per line.
data = []
with open("data.json", 'r') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        data.append(json.loads(line))
random.shuffle(data)

all_words_list = []
# Tokens grouped by rating: > 3 positive, < 3 negative, otherwise neutral.
positive_tokens = []
negative_tokens = []
neutral_tokens = []
documents = []
for one in data:
    if 'reviewText' not in one:
        continue  # nothing to learn from a record without review text
    content = one['reviewText']  # review text content
    rating_score = one['overall']  # ratings
    filter_words = do_preprocess_data(content)  # do pre process
    # Store the token list, not the raw string: linear_document_features
    # builds set(document), which on a string would be a set of single
    # characters and would almost never match a word feature.
    documents.append((filter_words, rating_score))
    all_words_list.extend(filter_words)
    if rating_score > 3:
        positive_tokens.extend(filter_words)
    elif rating_score < 3:
        negative_tokens.extend(filter_words)
    else:
        neutral_tokens.extend(filter_words)

# One feature row per document actually kept.
count = len(documents)
# Space-joined texts, consumed by the (currently disabled) word clouds below.
all_words = ' '.join(all_words_list)
positive_words = ' '.join(positive_tokens)
negative_words = ' '.join(negative_tokens)
neutral_words = ' '.join(neutral_tokens)

word_features = word_features_generate(all_words_list)
features = [[0] * len(word_features) for _ in range(count)]
classifier = linear_classifier_train(documents, word_features, features)
# Word-cloud rendering, kept disabled inside a string literal.
'''
wordcloud_all_words = WordCloud(background_color="white",width=1000, height=860, margin=2).generate(all_words)
plt.imshow(wordcloud_all_words)
plt.axis("off")
plt.show()
wordcloud_positive_words = WordCloud(background_color="white",width=1000, height=860, margin=2).generate(positive_words)
plt.imshow(wordcloud_positive_words)
plt.axis("off")
plt.show()
wordcloud_negative_words = WordCloud(background_color="white",width=1000, height=860, margin=2).generate(negative_words)
plt.imshow(wordcloud_negative_words)
plt.axis("off")
plt.show()
wordcloud_neutral_words = WordCloud(background_color="white",width=1000, height=860, margin=2).generate(neutral_words)
plt.imshow(wordcloud_neutral_words)
plt.axis("off")
plt.show()
'''
# Sanity check: predict the rating of a single hand-written sentence.
sentence_content1 = 'I like this guitar, it is very well'
sentence1_words = do_preprocess_data(sentence_content1)
sentence1_feature = [[0] * len(word_features)]
linear_document_features(sentence1_words, word_features, sentence1_feature, 0)
print(classifier.predict(sentence1_feature))

-------------------------------------
Predict variable is:
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Predict result is:4.42
-------------------------------------
-------------------------------------
Predict variable is:
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

Predict result is:4.42
-------------------------------------
-------------------------------------
Predict variable is:
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [23]:
# fit an SVR on the given split and report its test metrics
def model(X_train, X_test, Y_train, Y_test):
    """Train ``svm.SVR`` on the training split and print test-set metrics.

    Prints the mean squared error, mean absolute error and R^2 of the
    predictions for ``X_test`` against ``Y_test``, then returns the fitted
    estimator. (LinearRegression and LogisticRegression were alternative
    estimators the author had commented out here.)
    """
    regressor = svm.SVR()
    regressor.fit(X_train, Y_train)
    predictions = regressor.predict(X_test)
    mse = metrics.mean_squared_error(Y_test, predictions)
    print("MSE is:", mse)
    mae = metrics.mean_absolute_error(Y_test, predictions)
    print("MAE is:", mae)
    r_squared = metrics.r2_score(Y_test, predictions)
    print("R square is: ", r_squared)
    return regressor

In [24]:
def tf_idf_analysis(data, test_size=0.2, random_state=None):
    """Train and evaluate a regressor on TF-IDF features of the reviews.

    Every record that has a ``'reviewText'`` key is preprocessed with
    ``do_preprocess_data`` and re-joined into a space-separated string,
    paired with its ``'overall'`` rating. The pairs are split with
    ``train_test_split``, vectorized with a ``TfidfVectorizer`` (unigrams,
    at most 2000 features) fitted on the training part only, and handed to
    ``model``, which prints the evaluation metrics.

    Args:
        data: Iterable of review records (mappings).
        test_size: Forwarded to ``train_test_split``; defaults to 0.2, the
            value previously hard-coded.
        random_state: Forwarded to ``train_test_split``. The default None
            keeps the previous unseeded behavior; pass an int for a
            reproducible split.

    Returns:
        None. The fitted estimator returned by ``model`` is not propagated.
    """
    documents = []
    for one in data:
        if 'reviewText' not in one:
            continue
        content = one['reviewText']
        rating_score = one['overall']
        filter_words = do_preprocess_data(content)
        # join once instead of growing a string token by token
        documents.append((' '.join(filter_words), rating_score))
    train, test = train_test_split(
        documents, test_size=test_size, random_state=random_state
    )
    trains_sentences = [sentence for sentence, _ in train]
    trains_sentiments = [rating for _, rating in train]
    test_sentences = [sentence for sentence, _ in test]
    test_sentiments = [rating for _, rating in test]
    tf_vect = TfidfVectorizer(ngram_range=(1, 1), min_df=1, max_features=2000, use_idf=True)
    # Fit the vocabulary/IDF on the training texts only, then reuse it for test.
    X_train_tfidf = tf_vect.fit_transform(trains_sentences)
    X_test_tfidf = tf_vect.transform(test_sentences)
    model(X_train_tfidf, X_test_tfidf, trains_sentiments, test_sentiments)

In [25]:
import json
from nltk import FreqDist
from nltk import word_tokenize, pos_tag
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import random
from sklearn import linear_model
from sklearn import svm
from sklearn import ensemble
from sklearn import metrics
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
# Load the reviews (data.json is read as one JSON object per line), shuffle
# them, and run the TF-IDF experiment.
data = []
with open("data.json", 'r') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        # parse each line once, and avoid shadowing the builtin ``dict``
        data.append(json.loads(line))
random.shuffle(data)
tf_idf_analysis(data)



MSE is: 0.9518039403926449
MAE is: 0.5393868383870342
R square is:  -0.2023355192741232
