In [3]:
%matplotlib inline

import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


# using the SQLite Table to read data.
con = sqlite3.connect('./database.sqlite') 



#filtering only positive and negative reviews i.e. 
# not taking into consideration those reviews with Score=3
filtered_data = pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3
""", con) 




# Give reviews with Score>3 a positive rating, and reviews with a score<3 a negative rating.
def partition(x):
    if x < 3:
        return 'negative'
    return 'positive'

#changing reviews with score less than 3 to be positive and vice-versa
actualScore = filtered_data['Score']
positiveNegative = actualScore.map(partition) 
filtered_data['Score'] = positiveNegative

In [4]:
filtered_data.shape #looking at the number of attributes and size of the data
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


Data Cleaning: Deduplication


In [5]:
display= pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 AND UserId="AR5J8UI46CURR"
ORDER BY ProductID
""", con)
display

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


In [6]:
#Sorting data according to ProductId in ascending order
sorted_data=filtered_data.sort_values('ProductId', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')

In [7]:
#Deduplication of entries
final=sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first', inplace=False)
#final=final.drop_duplicates(subset={"UserId","ProductId","Time"}, keep='first', inplace=False)
final.shape

(364173, 10)

In [8]:
#Checking to see how much % of data still remains
(final['Id'].size*1.0)/(filtered_data['Id'].size*1.0)*100

69.25890143662969

In [9]:
display= pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 AND Id=44737 OR Id=64422
ORDER BY ProductID
""", con)
display


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
1,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


In [10]:
final=final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]


In [11]:
#Before starting the next phase of preprocessing lets see the number of entries left
print(final.shape)

#How many positive and negative reviews are present in our dataset?
final['Score'].value_counts()

(364171, 10)


positive    307061
negative     57110
Name: Score, dtype: int64

In [None]:
Data-preprocessing

In [12]:
# find sentences containing HTML tags
import re

i=0;
for sent in final['Text'].values:
    if (len(re.findall('<.*?>', sent))):
        print(i)
        print(sent)
        break;
    i += 1;    

        

6
I set aside at least an hour each day to read to my son (3 y/o). At this point, I consider myself a connoisseur of children's books and this is one of the best. Santa Clause put this under the tree. Since then, we've read it perpetually and he loves it.<br /><br />First, this book taught him the months of the year.<br /><br />Second, it's a pleasure to read. Well suited to 1.5 y/o old to 4+.<br /><br />Very few children's books are worth owning. Most should be borrowed from the library. This book, however, deserves a permanent spot on your shelf. Sendak's best.


In [13]:
import re

import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

stop = set(stopwords.words('english')) #set of stopwords
sno = nltk.stem.SnowballStemmer('english') #initialising the snowball stemmer

def cleanhtml(sentence): #function to clean the word of any html-tags
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', sentence)
    return cleantext
def cleanpunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    return  cleaned
print(stop)
print('************************************')
print(sno.stem('tasty'))

{'after', 'at', 'when', 'its', 'but', 'then', 'having', 'hasn', 'been', 'mightn', 'more', 'where', 'should', "don't", 'wasn', 'did', 'him', "you're", "weren't", 'into', 'than', 'mustn', 'won', "wasn't", 'so', 'have', 'all', 'am', 'for', 'own', 'doing', 'while', 'aren', 'she', 'about', 'of', 's', 'do', 'in', 'y', 'hadn', 'itself', 'to', "hadn't", 'himself', 'my', 'other', 'both', 'a', 'shouldn', 'ours', "she's", 'are', 'some', "should've", "wouldn't", 'that', 'o', 've', 'myself', 'whom', "shouldn't", 'wouldn', 'under', 'yours', 'through', 'off', 'again', 'needn', 'i', 'don', 'we', 'ma', 'during', "shan't", 'shan', 'just', 'haven', "isn't", 'with', 'be', 'if', 'out', 'his', 'who', 'being', 'not', "haven't", 'here', 'only', 'they', 'this', 'few', 'd', 'can', 'too', 'he', 'will', 'll', 'below', 'what', 'further', 'and', 'those', 'because', 'themselves', 'your', 'her', 'had', 'until', 'most', 'is', "mustn't", "you'll", 't', 'isn', 'by', 'from', 'yourselves', 'were', 'it', 'above', 'me', 'an

In [14]:
#Code for implementing step-by-step the checks mentioned in the pre-processing phase
i=0
str1=' '
final_string=[]
all_positive_words=[] # store words from +ve reviews here
all_negative_words=[] # store words from -ve reviews here.
s=''
for sent in final['Text'].values:
    filtered_sentence=[]
    #print(sent);
    sent=cleanhtml(sent) # remove HTMl tags
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if((cleaned_words.isalpha()) & (len(cleaned_words)>2)):    
                if(cleaned_words.lower() not in stop):
                    s=(sno.stem(cleaned_words.lower())).encode('utf8')
                    filtered_sentence.append(s)
                    if (final['Score'].values)[i] == 'positive': 
                        all_positive_words.append(s) #list of all words used to describe positive reviews
                    if(final['Score'].values)[i] == 'negative':
                        all_negative_words.append(s) #list of all words used to describe negative reviews reviews
                else:
                    continue
            else:
                continue 
    #print(filtered_sentence)
    str1 = b" ".join(filtered_sentence) #final string of cleaned words
    #print("***********************************************************************")
    
    final_string.append(str1)
    i+=1

In [15]:
final['CleanedText']=final_string #adding a column of CleanedText which displays the data after pre-processing of the review 

In [16]:
# store final table into an SQlLite table for future.
conn = sqlite3.connect('final.sqlite')
c=conn.cursor()
conn.text_factory = str
final.to_sql('Reviews', conn, flavor=None, schema=None, if_exists='replace', index=True, index_label=None, chunksize=None, dtype=None)

 Bag of Words (BoW)

In [17]:
final_df = pd.DataFrame(final) # converting table to a dataframe

In [18]:
random_final = final_df.sample(n=10000) # Sampling 10k rows 
random_final

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
45938,49985,B00430B73W,AOP5TUC7TTEY0,J,0,0,positive,1310774400,great bar,One of the best tasting bars I've found. Tast...,b'one best tast bar ive found tast like fresh ...
197833,214415,B0037N7GIQ,A40VNMBOUNRLL,"Sara Coppola ""Disappointed""",0,1,positive,1317686400,"Good product, bad packaging on Amazon's part",I order this pretty often and usually the pack...,b'order pretti often usual packag good howev l...
184240,199851,B001E52VJS,A1ZWIEWLLV1HOG,"Beau Pernoski ""pernoski""",4,5,positive,1220486400,tasty but too expensive on Amazon!,buy this stuff all the time but I'm not sure w...,b'buy stuff time sure anyon buy amazon box box...
156765,170011,B003VNCRMW,AYYT5ITL86BCW,Toobie,0,0,positive,1307318400,Great size!,5 of these arrived in the lot. It came in hand...,b'arriv lot came handi didnt add sugar plus ad...
413470,447171,B002CJAOR6,AXRM6XPAUE6IK,"Dorothea A. Warren ""tortie queen""",2,2,positive,1324339200,Very convenient!,I am thrilled i can get Friskies on Subscribe ...,b'thrill get friski subscrib save disabl two o...
138792,150623,B000HDHZZI,A1Y2BBN2TXYAOO,"s8ulbaker ""kirby""",4,4,positive,1268092800,Good Snack,"This is a good product to snack on, the packag...",b'good product snack packag small compani alwa...
79072,85977,B000CMKPDI,A2QIB5H8XFUE2U,"Zippy ""zippy""",27,48,positive,1149206400,Zap it before it hits the ground!,Here's the trick. If you zap it on the ground...,b'here trick zap ground stick grass good espec...
178749,193848,B005IW4WFY,A3QRA8J69DUXFL,ASP,1,1,positive,1341273600,yummy! except hold the blueberries.,These are great! Very tasty and filling. Bel...,b'great tasti fill believ say health nut start...
356723,385851,B000LKZ9KK,A2FCSLMZ98DSZI,Sheri,0,0,positive,1232064000,Best Tasting,I was first turned onto this product from a re...,b'first turn onto product review cook illustr ...
58810,63887,B004SNMAOO,A148W7XE1Y5D07,Raymond F. Ferrin,0,0,positive,1334966400,GOOD,"DUE TO GASTRIC BYPASS SURGERY, THESE ATKINS BA...",b'due gastric bypass surgeri atkin bar awesom ...


In [19]:
random_final = random_final.sort_values('Time') # Sorting the dataframe based on  Time

In [20]:
random_final

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
70688,76882,B00002N8SM,A32DW342WBJ6BX,Buttersugar,0,0,positive,948672000,A sure death for flies,I bought a few of these after my apartment was...,b'bought apart infest fruit fli hour trap mani...
346114,374420,B00004CI84,A1ZH086GZYL5MZ,Doug DeBolt,2,2,positive,1013385600,"A little gross, a lot of fun",Michael Keaton was already on his way to being...,b'michael keaton alreadi way major star play g...
409953,443373,B0000U1OFU,AFKKVFJ2DS4EL,Jonathan R. Pauling,0,1,positive,1068076800,Righteous Hot Sauce,I just love this sauce. I put in on everything.,b'love sauc put everyth'
152806,165713,B0000D9N59,A3FE2GUBM8JZ3G,TestMagic Inc.,26,31,positive,1073088000,One of the great cheeses of the world,The only real Parmigiano (Reggiano) is one of ...,b'real parmigiano reggiano one great chees wor...
401781,434430,B0000CA4TK,AJHU1QA9QZUC5,"J. Jackson ""ajacksonsf""",4,5,positive,1073865600,The Best Cuban Coffee,"I also live in exile in San Francisco, but I'm...",b'also live exil san francisco loyal miami roo...
458134,495351,B0083QJU54,AFXMTGF9XLJM1,Former Rater,5,5,positive,1080432000,The REAL Maple Syrup,I have had products from the Coombs farm in my...,b'product coomb farm pantri year mapl syrup ta...
10116,11049,B0000VYKXC,A3L88AH6MABEDL,Javier Denison,0,0,positive,1081209600,Best coffee I've tasted. I try others but alwa...,If you like a strong but smooth flavorful coff...,b'like strong smooth flavor coffe tri other ke...
502450,543231,B0001J2NDW,A43DUOVTS3MZ9,Michael J Collier,0,0,positive,1083715200,The best coco I have ever had,I enjoy hot coco and I must say that this is t...,b'enjoy hot coco must say best ever'
5886,6374,B000084EKB,A1Z54EM24Y40LL,c2,0,0,positive,1091059200,Great food!,This version is ground. NOt my current cat's ...,b'version ground current cat favorit version r...
24069,26321,B000121BY6,A1D6AMH6WH4DBW,Psin Khan,3,4,positive,1092009600,Most excellent carbonated beverage,"One of the things I miss about Austin, TX is t...",b'one thing miss austin littl soft drink orang...


In [40]:
#BoW
count_vect = CountVectorizer() 
final_counts = count_vect.fit_transform(random_final['CleanedText'].values)


In [41]:
type(final_counts) # sparse matrix

scipy.sparse.csr.csr_matrix

In [42]:
final_counts.get_shape() # shape of matrix after Vectorizing

(10000, 13067)

In [44]:
# Standardizing the data with mean = 0 and std.dev = 1
from sklearn.preprocessing import StandardScaler
standardized_data = StandardScaler(with_mean=False).fit_transform(final_counts)
print(standardized_data.shape)

(10000, 13067)




In [29]:
X = standardized_data.toarray() # converting the standardized data to dense array
y = np.array(random_final['Score']) # converting Review attribute to a numpy array

In [31]:
from sklearn.model_selection import train_test_split

# split the data set into train and test 70% train and 30% test 
X1, X_test, y1, y_test = train_test_split(X, y, test_size=0.3, shuffle=False, random_state=0)

In [32]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
                               
tss = TimeSeriesSplit(n_splits=10) # TimeSeries Split with number of splits=10
N_BB = BernoulliNB() # applying Bernoulli Naive-Bayes

a = np.logspace(0.0, 10.0, num=50, base=1.25) # specifying the range of the hyperparameter alpha for GridSearch

NB_params = {'alpha':a} # hyperparameter to choose  

# GridSearch for finding right hyperparameter with with 10 fold CV on TimeSeriesSplit
gcv = GridSearchCV(N_BB, NB_params,cv=tss, refit=True) 

# Fit the training data
gcv.fit(X1, y1)

# find the mean score for Cross-Validation using 10 Fold Time Series CV after finding the right hyperparameter 
score_BOW = cross_val_score(gcv, X=X1, y=y1, cv=tss).mean()
print(score_BOW)

0.8419811320754716


In [33]:
gcv.best_params_  # print the best hyperparameter alpha

{'alpha': 9.313225746154785}

In [36]:
from sklearn.metrics import recall_score, f1_score, confusion_matrix, accuracy_score, precision_score


# instantiate learning model for best alpha
N_BB_op = BernoulliNB(alpha=9.313225746154785)

# fitting the model
N_BB_op.fit(X1, y1)

# predict the response
pred_NB = N_BB_op.predict(X_test)

# print top 20 features for each class for BOW
print("Top 20 Positive Features for BOW and Top 20 Negative Features for BOW ")
print("\n\n")

# defining a function for finding the top features of each class
def top_most_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names() # getting all the feature names
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names)) # retrieve the coefficient & sort them based on values 
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1]) 
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print ("\t%-15s\t%.4f\t%-15s\t\t%-15s\t%.4f\t%-15s" % (fn_2, coef_1, 'positive', fn_1, coef_2, 'negative'))

        
top_most_features(count_vect, N_BB_op)

print()

# evaluate various performance measures:
print("Accuracy = ",accuracy_score(y_test, pred_NB)) # accuracy
print("Confusion Matrix\n", confusion_matrix(y_test, pred_NB)) # Confusion Matrix
tn, fn, fp, tp = confusion_matrix(y_test, pred_NB).ravel() # tn, fn, fp, tp
print("(tn, fn, fp, tp) =",(tn, fn, fp, tp))
print("Recall = ",recall_score(y_test, pred_NB, pos_label='positive')) # Recall
print("f1-Score = "  ,f1_score(y_test, pred_NB, pos_label='positive')) # f1-Score
print("Precision = ",precision_score(y_test, pred_NB, pos_label='positive'))# Precision


Top 20 Positive Features for BOW and Top 20 Negative Features for BOW 



	like           	-6.4548	positive       		aaaaaahhhhh    	-1.1674	negative       
	tast           	-6.4548	positive       		aaah           	-1.2067	negative       
	love           	-6.4548	positive       		aafco          	-1.2192	negative       
	great          	-6.4548	positive       		aamzon         	-1.2453	negative       
	good           	-6.4548	positive       		abbazabba      	-1.2769	negative       
	flavor         	-6.4548	positive       		abbrevi        	-1.4353	negative       
	use            	-6.4548	positive       		abc            	-1.4707	negative       
	tri            	-6.4548	positive       		abhor          	-1.5158	negative       
	product        	-6.4548	positive       		abnorm         	-1.5220	negative       
	one            	-6.4548	positive       		abomin         	-1.5251	negative       
	make           	-6.4548	positive       		abouy          	-1.6025	negative       
	get            	-6.4548

 TF-IDF

In [21]:
# Taking Tf-idf bigram and vectorizing it
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2))
final_tf_idf = tf_idf_vect.fit_transform(random_final['CleanedText'].values)


In [22]:
final_tf_idf.get_shape() # Shape of tf-idf vectorizer


(10000, 236244)

In [31]:
# Standardizing the data with mean=0 and std.dev=1
from sklearn.preprocessing import StandardScaler
standardized_data_tf = StandardScaler(with_mean=False).fit_transform(final_tf_idf)
print(standardized_data_tf.shape)

(10000, 400)


In [32]:
X_tf = standardized_data_tf.toarray() # storing the values after standardization in a dense array
y_tf = np.array(random_final['Score']) # storing the values of Scores in numpy array

In [33]:
# split the data set into train and test, 70% train and 30% test 
from sklearn.model_selection import train_test_split

X2, X_test2, y2, y_test2 = train_test_split(X_tf, y_tf, test_size=0.3, shuffle=False, random_state=0)

In [34]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
                               
tss_tf = TimeSeriesSplit(n_splits=10) # TimeSeries Split with number of splits=10
N_BB_tf = BernoulliNB()  # applying Bernoulli Naive-Bayes

atf = np.logspace(0.0, 10.0, num=50, base=1.25) # specifying the range of the hyperparameter alpha for GridSearch

NB_params_tf = {'alpha':atf} # hyperparameter to choose  

# GridSearch for finding right hyperparameter with with 10 fold CV on TimeSeriesSplit
gcv_tf = GridSearchCV(N_BB_tf, NB_params_tf,cv=tss_tf, refit=True)

# Fit the training data
gcv_tf.fit(X2, y2)

# find the mean score for Cross-Validation using 10 Fold Time Series CV after finding the right hyperparameter 
score_tf = cross_val_score(gcv_tf, X=X2, y=y2, cv=tss_tf).mean()
print(score_tf)

0.8641509433962264


In [35]:
gcv_tf.best_params_  # print the best hyperparameter alpha

{'alpha': 1.0953555355153197}

In [39]:
from sklearn.metrics import recall_score, f1_score, confusion_matrix, accuracy_score, precision_score


# instantiate learning model for best alpha
N_BB_op_tf = BernoulliNB(alpha=1.0953555355153197)

# fitting the model
N_BB_op_tf.fit(X2, y2)

# predict the response
pred_NB_tf = N_BB_op_tf.predict(X_test2)


# print top 20 features for each class for tf-idf
print("Top 20 Positive Features for Tf-idf and Top 20 Negative Features for Tf-idf ")
print("\n\n")


# defining a function for finding the top features of each class
def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()  # getting all the feature names
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names)) # retrieve the coefficient & sort them based on values 
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print ("\t%-10s\t%.4f\t%-15s\t\t%-10s\t%.4f\t%-15s" % (fn_2, coef_1, 'positive', fn_1, coef_2, 'negative'))
                


show_most_informative_features(tf_idf_vect, N_BB_op_tf)

print()

# evaluate various performance measures:
print("Accuracy = ",accuracy_score(y_test2, pred_NB_tf))# accuracy
print("Confusion Matrix\n", confusion_matrix(y_test2, pred_NB_tf))# Confusion matrix
tn, fn, fp, tp = confusion_matrix(y_test2, pred_NB_tf).ravel()# tn, fn, fp, tp
print("(tn, fn, fp, tp) =",(tn, fn, fp, tp))
print("Recall = ",recall_score(y_test2, pred_NB_tf, pos_label='positive')) # Recall
print("f1-Score = "  ,f1_score(y_test2, pred_NB_tf, pos_label='positive')) # f1-score
print("Precision = ",precision_score(y_test2, pred_NB_tf, pos_label='positive'))# precision


Top 20 Positive Features for Tf-idf and Top 20 Negative Features for Tf-idf 



	aback two 	-1.3274	positive       		aaf came  	-0.6145	negative       
	abl bing  	-1.2924	positive       		aaf       	-0.6417	negative       
	abdomin fat	-0.9962	positive       		aafco     	-0.6533	negative       
	abl sampl 	-0.7930	positive       		abamectin fipronil	-0.6624	negative       
	absolut real	-0.7870	positive       		aafco requir	-0.6631	negative       
	abil mani 	-0.7723	positive       		abel      	-0.6631	negative       
	abl focus 	-0.7654	positive       		abil make 	-0.6634	negative       
	abl coffe 	-0.7632	positive       		abil choos	-0.6644	negative       
	abil adjust	-0.7549	positive       		aaa       	-0.6657	negative       
	abid seem 	-0.7539	positive       		abi       	-0.6667	negative       
	absolut must	-0.7471	positive       		abl digest	-0.6670	negative       
	absolut ador	-0.7456	positive       		aback first	-0.6690	negative       
	acai mix  	-0.7449	positive       		