## Einsatz des Naive Bayes Classifier zur Extraktion der Top 10 Keywords pro Sterne-Klasse

Als Basis des Notebooks wurde folgendes Videotutorial verwendet: https://www.youtube.com/watch?v=5xDE06RRMFk

In [1]:
import json as j
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

In [2]:
# Load the Musical Instruments reviews from the Data_filtered directory.
# The file is semicolon-delimited, hence the explicit separator.
reviews_csv = "./Data_filtered/reviews_Musical_Instruments.csv"
data = pd.read_csv(filepath_or_buffer=reviews_csv, sep=";")
# Last expression of the cell: the notebook renders the first rows as a preview.
data.head()

Unnamed: 0,review,rating
0,"['much', 'write', 'exactly', 'supposed', 'filt...",5.0
1,"['product', 'exactly', 'quite', 'affordablei',...",5.0
2,"['primary', 'job', 'device', 'block', 'breath'...",5.0
3,"['nice', 'windscreen', 'protects', 'mxl', 'mic...",5.0
4,"['pop', 'filter', 'great', 'looks', 'performs'...",5.0


In [5]:
# Hold out 20% of the reviews for evaluation. The seed is fixed so that the
# split -- and therefore the accuracy and keyword lists printed further down --
# is reproducible across notebook runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['review'], data['rating'], test_size=0.2, random_state=42
)

# Three-stage text classification pipeline:
#   vect: TF-IDF over unigrams and bigrams, English stop words removed,
#         sublinear (1 + log(tf)) term-frequency scaling.
#   chi:  keep the 10000 features with the highest chi-squared score
#         against the rating labels.
#   clf:  multinomial Naive Bayes with additive smoothing alpha=1.
pipeline = Pipeline([
    ('vect', TfidfVectorizer(ngram_range=(1, 2), stop_words="english", sublinear_tf=True)),
    ('chi', SelectKBest(chi2, k=10000)),
    ('clf', MultinomialNB(alpha=1)),
])

In [6]:
# Fit the whole pipeline (vectorizer -> feature selection -> classifier)
# on the training split.
model = pipeline.fit(X_train, y_train)

vectorizer = model.named_steps['vect']
chi = model.named_steps['chi']
clf = model.named_steps['clf']

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Keep only the names of the features that survived the chi2 selection, so the
# indices line up with the columns the classifier was trained on.
feature_names = np.asarray(feature_names)[chi.get_support(indices=True)]

# Take the labels from the fitted classifier instead of hard-coding them: the
# rows of feature_log_prob_ are ordered like clf.classes_, so this stays
# correct even if a rating is absent from the training split.
# Numeric labels are printed compactly (5.0 -> "5").
classes_are_numeric = np.issubdtype(clf.classes_.dtype, np.number)

print("top 10 keywords per class:")
for i, cls in enumerate(clf.classes_):
    label = f"{cls:g}" if classes_are_numeric else str(cls)
    # MultinomialNB.coef_ is deprecated/removed; feature_log_prob_ holds the
    # same per-class log-probabilities. argsort is ascending, so the last ten
    # indices are the ten most probable features (strongest one printed last).
    top10 = np.argsort(clf.feature_log_prob_[i])[-10:]
    # Separate with ", " because features may be bigrams containing a space.
    print("%s: %s" % (label, ", ".join(feature_names[top10])))

# Mean accuracy on the held-out test split.
print("accuracy score: " + str(model.score(X_test, y_test)))

# Sanity check: predict the rating of a single unseen raw-text review.
print(model.predict(['that was an awesome place. Great food!']))

top 10 keywords per class:
1: star guess maybe useless broke received good thought bad cheap
2: little broke fine great thought bad returned hard good cheap
3: bad fine hard great cheap bit job ok little good
4: hard stars job cheap easy bit fine little great good
5: cheap job highly works great perfect little love easy good great
accuracy score: 0.6702386751095957
[5.]
