In [1]:
import pandas as pd 
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score,confusion_matrix
from sklearn.pipeline import Pipeline

In [2]:
# Load the labelled review dataset from the working directory.
review_df = pd.read_csv('Amazon_Reviews.csv')
# Keep an independent snapshot of the frame as loaded. A plain
# `dump = review_df` only creates a second name for the same object, so
# in-place edits made to `review_df` would also change `dump`.
dump = review_df.copy()
dump

Unnamed: 0,Review,Label
0,Stuning even for the non-gamer: This sound tr...,1
1,The best soundtrack ever to anything.: I'm re...,1
2,Amazing!: This soundtrack is my favorite musi...,1
3,Excellent Soundtrack: I truly like this sound...,1
4,"Remember, Pull Your Jaw Off The Floor After H...",1
...,...,...
194,A Book That Is Worth a Second Look: This book...,1
195,Best game ever: This games makes even amazing...,1
196,Guitar in Absentia: With all due respect to a...,0
197,Stiff and Smells like drying paint: You get w...,0


In [3]:
'''
1.tokenize 
2.Remove stopwords
3.lemmatize
4.Build the tf idf matrix 
5.Train the model 

'''

'\n1.tokenize \n2.Remove stopwords\n3.lemmatize\n4.Build the tf idf matrix \n5.Train the model \n\n'

In [4]:
# Quick check of NLTK's word_tokenize on a sample sentence.
word_tokenize('Sun rises in the east')

['Sun', 'rises', 'in', 'the', 'east']

In [5]:
# Regex tokenizer that keeps runs of word characters (\w+), so punctuation
# never becomes a token.
tokenizer = RegexpTokenizer(r'\w+')
# Stored as a set so the per-token stop-word membership test is a hash lookup
# instead of a scan through a list.
stopwords_en = set(stopwords.words('english'))
# One lemmatizer instance shared by every call to preprocesser.
lemmatizer = WordNetLemmatizer()

In [6]:
def preprocesser(reviews):
    """Clean one review string before it is vectorized.

    The text is split with the notebook-level ``tokenizer``, tokens whose
    lower-cased form is found in ``stopwords_en`` are discarded, and every
    remaining token is lemmatized as a verb (``pos='v'``).

    Parameters
    ----------
    reviews : str
        Text of a single review.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    kept_lemmas = []
    for word in tokenizer.tokenize(reviews):
        # Only the stop-word comparison is lower-cased; the token that is
        # lemmatized keeps its original casing.
        if word.lower() in stopwords_en:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(word, pos='v'))
    return ' '.join(kept_lemmas)
# Smoke test: 'The' and 'was' are stop words, so only the other three tokens
# come back.
preprocesser('The sound sytem was good')

'sound sytem good'

In [7]:
# Clean every review with preprocesser. The 'Review' column is overwritten, so
# review_df no longer holds the raw text after this cell.
review_df['Review'] = review_df['Review'].apply(preprocesser)

In [8]:
# Pull out the Label column as the prediction target.
output = review_df['Label']

In [9]:
# Remove the target column from the feature frame. Reassigning (rather than
# dropping in place) and ignoring an already-missing column keeps this cell
# safe to re-run; an in-place drop raises KeyError on the second run.
review_df = review_df.drop(columns=['Label'], errors='ignore')

In [10]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test , y_train,y_test = train_test_split(review_df,output,test_size = 0.2,random_state=1)

In [11]:
# Fit the TF-IDF vectorizer on the training reviews only, then reuse the
# fitted vectorizer on the test reviews so the test text does not influence
# the learned vocabulary or weights.
vectorizers = TfidfVectorizer()
train_df = vectorizers.fit_transform(X_train['Review'])
test_df = vectorizers.transform(X_test['Review'])

In [12]:
# Multinomial Naive Bayes classifier, constructed with no arguments.
mnb = MultinomialNB()

In [13]:
# Train the classifier on the TF-IDF features of the training reviews.
mnb.fit(train_df,y_train)

In [14]:
# Predict a label for each held-out review.
y_pred = mnb.predict(test_df)


In [15]:
# A notebook cell only echoes its final expression, so the other metrics are
# printed explicitly; as bare expressions they were computed and thrown away.
print('Confusion matrix:')
print(confusion_matrix(y_test,y_pred))
print('Precision:', precision_score(y_test,y_pred))
print('Recall:', recall_score(y_test,y_pred))
print('F1 score:', f1_score(y_test,y_pred))
# Accuracy stays as the last expression so it remains the cell's output.
accuracy_score(y_test,y_pred)

0.525

In [16]:
import pickle

In [17]:
# File name the fitted classifier is pickled to.
filename = 'sentimental_analysis.sav'

In [18]:
# Serialize the fitted classifier. The context manager closes the file handle
# as soon as the dump finishes; a bare open() call would leave it open until
# garbage collection. Only the classifier is written here, not the TF-IDF
# vectorizer.
with open(filename,'wb') as model_file:
    pickle.dump(mnb,model_file)

In [19]:
# Load the saved classifier back from disk; the context manager closes the
# file handle afterwards.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
with open('sentimental_analysis.sav','rb') as model_file:
    loaded_model = pickle.load(model_file)

In [20]:
# Bundle the TF-IDF step and the classifier into one estimator that accepts
# text directly. The steps are the `vectorizers` and `mnb` objects created in
# earlier cells.
pipline = Pipeline([('tfidf',vectorizers),('clf',mnb)])

In [21]:
# Fit both pipeline steps on the training reviews (already cleaned by
# preprocesser in an earlier cell).
pipline.fit(X_train['Review'],y_train)

In [22]:
# Run the held-out reviews through the pipeline and show how many predictions
# came back.
y_predict = pipline.predict(X_test['Review'])
len(y_predict)

40

In [23]:
import joblib
# Persist the whole fitted pipeline (vectorizer + classifier) with joblib.
joblib.dump(pipline,"3rd_model_pipeline.pk")

['3rd_model_pipeline.pk']

In [24]:
# The pipeline was fitted on reviews cleaned by preprocesser, so new text has
# to go through the same cleaning before prediction. Passing the raw string
# gave the model input in a different form from its training data.
predection = pipline.predict([preprocesser('Guitar in Absentia: With all due respect to a..')])

In [25]:
# predict() returns an array with one entry per input text. Index the single
# prediction explicitly instead of relying on the truth value of a whole-array
# comparison, which raises as soon as the array has more than one element.
sentiment = 'Positive' if predection[0] == 1 else 'Negative'
print(sentiment)

Negative


In [26]:
# Reload the pipeline from the file written by joblib.dump in an earlier cell.
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.
model = joblib.load("3rd_model_pipeline.pk")

In [27]:
# Single ad-hoc review used to try out the reloaded pipeline.
new_review = ['BadProduct']

In [28]:
# Clean the new text with the same preprocesser that was applied to the
# training reviews before predicting; the raw strings were passed unchanged
# before.
model.predict([preprocesser(review) for review in new_review])

array([1], dtype=int64)