In [1]:
import numpy as np
import csv
from textblob import TextBlob

In [2]:
import pandas as pd
import wordcloud
import matplotlib.pyplot as plt


In [3]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [4]:
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# from sklearn.naive_bayes import MultinomialNB



In [95]:
df1_review = pd.read_csv('C:/Users/hello/Desktop/Project3/resources/amazon_reviews_v2.csv',low_memory=False)
df1_review.head()

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT
0,20,__label1__,4,Y,Health & Personal Care,B00KNXIDH6,100 Tablet CleanGuard Nightguard Cleaner,and he was satisfied with this,These tablets are especially helpful if you us...
1,21,__label1__,4,N,Home,B00LWRZFAA,Birds Flying Black Tree Branches Wall Sticker ...,best of money value,Looking decent as same shown in photos Thank Y...
2,22,__label1__,4,Y,Health & Personal Care,B00B2YGB9M,"Garcinia Cambogia Pure Extract Supplement, 80%...",It's harder to lose weight the older you get,"I find that the older I get, the harder it is ..."
3,23,__label1__,3,N,Camera,B004TJ6JH6,NEEWER® 160 LED CN-160 Dimmable Ultra High Pow...,So easy to use!!,I have had my camera for about 2 weeks now. Th...
4,24,__label1__,2,Y,Health & Personal Care,B00OBDRLVS,NatureWise Garcinia Cambogia Extract (Not Synt...,Stay Away And Don't Buy It,It is highly recommended not to buy this produ...


#### Insert pos_neg column for Sentiment modeling
     Negative reviews:      1-3 Stars  = 0
     Positive reviews:      4-5 Stars  = 1

In [96]:
df1_review['pos_neg'] = [1 if x > 3 else 0 for x in df1_review.RATING]

In [97]:
df_review = df1_review[['pos_neg','REVIEW_TEXT']].copy()
df_review.head()

Unnamed: 0,pos_neg,REVIEW_TEXT
0,1,These tablets are especially helpful if you us...
1,1,Looking decent as same shown in photos Thank Y...
2,1,"I find that the older I get, the harder it is ..."
3,0,I have had my camera for about 2 weeks now. Th...
4,0,It is highly recommended not to buy this produ...


In [98]:
# replace NaN values with some value of their own.
df_review.REVIEW_TEXT.fillna(' ')

0     These tablets are especially helpful if you us...
1     Looking decent as same shown in photos Thank Y...
2     I find that the older I get, the harder it is ...
3     I have had my camera for about 2 weeks now. Th...
4     It is highly recommended not to buy this produ...
5     The actual power supply is good, but the cable...
6     My personal physician recommended it and I bou...
7     I love the fact that you can easily read the m...
8     I have bought several products from this selle...
9     This is an extremely well made electric blanke...
10    great product for coffee lover that want littl...
11    I figured it would for sure fit a dasani bottl...
12    This dinnerware set provides a very nice aesth...
13    The sides of the case didn't match my phone at...
14    Since I have a low pain tolerance this facial ...
15    The bag looks like a little smaller version th...
16    Bought this to use with my gel eyeliner. The h...
17    This mask is great for the purpose of keep

In [99]:
df_review['REVIEW_TEXT'].values.astype('U')

array(['These tablets are especially helpful if you use the Secure dental adhesive.<br />I bought this stuff for my father. and he was satisfied with this.',
       'Looking decent as same shown in photos Thank You amazon .I saved cost as paint of this type is very costly and as this is digitally print looking decent.',
       "I find that the older I get, the harder it is to lose weight....but I've gotten great results taking garcinia this month. I take it before breakfast, lunch and supper. I don't take the one after supper because I don't tend to eat much in the late evenings. I find it really cuts my down my appetite. I've lost 8 lbs this month, which is WAY more than I've lost in a long time.",
       'I have had my camera for about 2 weeks now. The pictures are great, it is so easy to use. I love it!!',
       'It is highly recommended not to buy this product.  I took the suggested dosage for a week and felt nothing, no extra energy, no loss of appetite, no pounds lost.',
       

In [100]:
df_review['REVIEW_TEXT'].count()

19

In [101]:
df_review['pos_neg'].value_counts()

1    13
0     6
Name: pos_neg, dtype: int64

In [102]:
df_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 2 columns):
pos_neg        19 non-null int64
REVIEW_TEXT    19 non-null object
dtypes: int64(1), object(1)
memory usage: 384.0+ bytes


# Preprocessing

In [74]:
stops = stopwords.words('english')

In [75]:
# TextBlob(message).words will give us collection of words from a sentence.
def split_into_tokens(sentence):
    return TextBlob(sentence).words

df_review.REVIEW_TEXT.head().apply(split_into_tokens)

0    [When, least, you, think, so, this, product, w...
1    [Lithium, batteries, are, something, new, intr...
2    [I, purchased, this, swing, for, my, baby, She...
3    [I, was, looking, for, an, inexpensive, desk, ...
4    [I, only, use, it, twice, a, week, and, the, r...
Name: REVIEW_TEXT, dtype: object

In [76]:
# Lemma is converting words into it's root form. 
def split_into_lemmas(message):
    message = message.lower()
    words = TextBlob(message).words
    return [word.lemma for word in words]

df_review.REVIEW_TEXT.head().apply(split_into_lemmas)

0    [when, least, you, think, so, this, product, w...
1    [lithium, battery, are, something, new, introd...
2    [i, purchased, this, swing, for, my, baby, she...
3    [i, wa, looking, for, an, inexpensive, desk, c...
4    [i, only, use, it, twice, a, week, and, the, r...
Name: REVIEW_TEXT, dtype: object

# Classification/Sentiment Analysis

In [None]:
reviewcloud = wordcloud.WordCloud(background_color='pink', max_font_size=50, 
                                relative_scaling=1).generate(' '.join(str(n)for n in df_review.REVIEW_TEXT))


In [None]:
fig = plt.figure(figsize=(20, 10))
plt.axis('off')
plt.imshow(reviewcloud);

# Train/Split
#train for the models on the data given. 

In [105]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_review.REVIEW_TEXT, df_review.pos_neg,test_size=0.2, random_state=42)


In [106]:
print("X_train shape: {}".format(X_train.shape), end='\n')
print("y_train shape: {}".format(y_train.shape), end='\n\n')
print("X_test shape: {}".format(X_test.shape), end='\n')
print("y_test shape: {}".format(y_test.shape), end='\n\n')

X_train shape: (15,)
y_train shape: (15,)

X_test shape: (4,)
y_test shape: (4,)



In [108]:
# X_test = pd.Series(X_test.flatten())
# X_test

# CountVectorization
#list of tokens (lemmas) above is converted into a vector that machine learning models can understand.

In [109]:
type(X_test)

pandas.core.series.Series

In [110]:
vectorizer = CountVectorizer(analyzer=split_into_lemmas, ngram_range=(2,4),
    lowercase = False,)
X_train_vect = vectorizer.fit_transform((X_train).values.astype(str))
print("X_train_vect:\n{}".format(repr(X_train_vect)))

#Pandas astype()methods is used to change data type of a series

X_train_vect:
<15x410 sparse matrix of type '<class 'numpy.int64'>'
	with 689 stored elements in Compressed Sparse Row format>


In [111]:
#vectorizer.vocabulary_
print( len(vectorizer.vocabulary_))

#Each vector has as many dimensions as there are unique words in the corpus:

410


In [112]:
X_train_vect.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 1, 1, 0]], dtype=int64)

# Logistic Regression

In [113]:
feature_names = vectorizer.get_feature_names()
print("Number of features: {}".format(len(feature_names)))

Number of features: 410


In [114]:
# giving less weightage to frequently occuring words,
# the term weighting and normalization is done with TF-IDF, using scikit-learn's TfidfTransformer:
transformer = TfidfTransformer(smooth_idf=False)
tfidf_X_train = transformer.fit_transform(X_train_vect)

tfidf_X_train

<15x410 sparse matrix of type '<class 'numpy.float64'>'
	with 689 stored elements in Compressed Sparse Row format>

In [115]:
# tfidf_X_train.toarray()
print(tfidf_X_train.shape)

(15, 410)


In [116]:
len(tfidf_X_train.toarray())

15

In [117]:
# Fitting in the model - logistic Regression
logreg = LogisticRegression(C=0.1).fit(tfidf_X_train, y_train)
logreg

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [124]:
X_test_vect = vectorizer.transform((X_test).values.astype(str))

In [125]:
tfidf_X_test = transformer.transform(X_test_vect)

In [120]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('lg', LogisticRegression()),
])

In [121]:
pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)

0.5

In [126]:
#Now, the model logreg is ready to predict on the test data.
log_y_pred = logreg.predict(tfidf_X_test)

ValueError: X has 86 features per sample; expecting 410

In [None]:
logreg_score = accuracy_score(y_test, log_y_pred)
print("Accuracy:   {:.3f}".format(logreg_score))

In [None]:
log_cfm = confusion_matrix(y_test, log_y_pred)
print("Confusion matrix:")

In [None]:
log_f1 = f1_score(y_test, log_y_pred)
print("Logistic Reg - F1 score: {:.3f}".format(log_f1))

# Sentiment Analysis
The sentiment property returns a namedtuple of the form Sentiment(polarity, subjectivity). The polarity score is a float within the range [-1.0, 1.0]. 
The subjectivity is a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.

In [103]:
df_review.to_csv("reviewtext.csv", sep=",", index=None)

In [104]:
infile = "reviewtext.csv"
from textblob.sentiments import NaiveBayesAnalyzer
n_list = list()
p_list = list()
c_list = list()
with open(infile, 'r') as csvfile:
    rows = csv.reader(csvfile)
    for row in rows:
        sentence = row[0]
        blob = TextBlob(sentence,analyzer=NaiveBayesAnalyzer() )
        each_sentiment = blob.sentiment
#         n_list.append(each_sentiment.p_neg)
#         p_list.append(each_sentiment.p_pos)
#         c_list.append(each_sentiment.classification)
#         d = dict()
#         d['classification']=c_list
#         d['p_pos']=p_list
#         d['p_neg']=n_list
        print(each_sentiment)
        
        

Sentiment(classification='pos', p_pos=0.5, p_neg=0.5)
Sentiment(classification='pos', p_pos=0.5, p_neg=0.5)
Sentiment(classification='pos', p_pos=0.5, p_neg=0.5)
Sentiment(classification='pos', p_pos=0.5, p_neg=0.5)
Sentiment(classification='pos', p_pos=0.5, p_neg=0.5)
Sentiment(classification='pos', p_pos=0.5, p_neg=0.5)
Sentiment(classification='pos', p_pos=0.5, p_neg=0.5)
Sentiment(classification='pos', p_pos=0.5, p_neg=0.5)
Sentiment(classification='pos', p_pos=0.5, p_neg=0.5)
Sentiment(classification='pos', p_pos=0.5, p_neg=0.5)
Sentiment(classification='pos', p_pos=0.5, p_neg=0.5)
Sentiment(classification='pos', p_pos=0.5, p_neg=0.5)
Sentiment(classification='pos', p_pos=0.5, p_neg=0.5)
Sentiment(classification='pos', p_pos=0.5, p_neg=0.5)
Sentiment(classification='pos', p_pos=0.5, p_neg=0.5)
Sentiment(classification='pos', p_pos=0.5, p_neg=0.5)
Sentiment(classification='pos', p_pos=0.5, p_neg=0.5)
Sentiment(classification='pos', p_pos=0.5, p_neg=0.5)
Sentiment(classification='po

In [92]:
d = pd.DataFrame(d)
d

Unnamed: 0,classification,p_pos,p_neg
0,pos,0.5,0.5
1,pos,0.5,0.5
2,pos,0.5,0.5
3,pos,0.5,0.5
4,pos,0.5,0.5
5,pos,0.5,0.5
6,pos,0.5,0.5
7,pos,0.5,0.5
8,pos,0.5,0.5
9,pos,0.5,0.5


In [93]:
correct = 0
count = 0
with open(infile, encoding="utf8",mode = 'r') as file:
    for line in file.read().split('\n'):
            analysis = TextBlob(line)
            if analysis.sentiment.polarity <=0.7:
                correct +=1
            count +=1    
print("Accuracy = {}% via {} samples".format(correct/count*100,count))     

Accuracy = 100.0% via 21 samples


In [None]:
from textblob.sentiments import NaiveBayesAnalyzer


>>> blob = TextBlob("I love this library", analyzer=NaiveBayesAnalyzer())
>>> blob.sentiment