In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import FeatureUnion, Pipeline



In [2]:
# Load the tab-separated reviews file into a DataFrame.
# Later cells read its 'Review' and 'Liked' columns.
df = pd.read_csv("Restaurant_Reviews.tsv", sep="\t")

### Preprocessing experiment (currently disabled; kept for reference) ###
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# from nltk.stem import PorterStemmer


# Remove every character that is neither a word character nor whitespace.
# df['Review'] = df['Review'].str.replace('[^\w\s]', '')

# Tokenization
# NOTE(review): `stop_words` is built here but the tokenizing lambda below never filters on it.
# stop_words = set(stopwords.words('english'))
# df['Review'] = df['Review'].apply(lambda x: [word for word in word_tokenize(x)])

# # Stemming
# # stemmer = PorterStemmer()
# # df['Review'] = df['Review'].apply(lambda x: [stemmer.stem(word) for word in x])
# Re-join the token lists into a single space-separated string per review.
# df['Review'] = df['Review'].apply(lambda x: ' '.join(x))

In [3]:
# test of n_grams
def nozip_ngrams(text, n=4):
    """Return every contiguous window of length ``n`` taken from ``text``.

    Works on any sliceable sequence: for a string the windows are character
    n-grams, for a list they are sub-lists.

    Args:
        text: Sequence to slide the window over.
        n: Window length (defaults to 4).

    Returns:
        A list of the slices ``text[i:i+n]`` in left-to-right order. The list
        is empty when ``text`` is shorter than ``n``.
    """
    window_count = len(text) - n + 1
    windows = []
    for start in range(window_count):
        stop = start + n
        windows.append(text[start:stop])
    return windows

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['Review'],
    df['Liked'],
    test_size=0.2,
    random_state=42,
)

# Search space: each vectorizer class x each n-gram range x each classifier class.
n_grams = [(1, 1), (1, 2), (1, 3)]
vectorizers = [CountVectorizer, TfidfVectorizer]
classiffiers = [MultinomialNB, LogisticRegression, RandomForestClassifier]

# Collects one row per evaluated combination (filled by test_vect_gram_class).
res = []

def test_vect_gram_class(vector, n_gram, classifier, random_state=42):
    """Fit and score one vectorizer / n-gram range / classifier combination.

    Builds a pipeline from a ``vector(ngram_range=n_gram)`` feature step and a
    ``classifier()`` estimator, fits it on the notebook-level ``X_train`` /
    ``y_train``, scores it on ``X_test`` / ``y_test`` and appends the outcome
    to the notebook-level ``res`` list.

    Args:
        vector: Vectorizer *class* (not instance) accepting ``ngram_range``.
        n_gram: ``(min_n, max_n)`` tuple forwarded as ``ngram_range``.
        classifier: Estimator *class* (not instance), built with its defaults.
        random_state: Seed applied to the estimator when it exposes a
            ``random_state`` parameter, so repeated runs produce the same
            accuracy for stochastic models such as ``RandomForestClassifier``.

    Returns:
        None. The row ``[vectorizer name, n_gram, classifier name, accuracy,
        fitted pipeline]`` is appended to ``res`` as a side effect.
    """
    combined_features = FeatureUnion([
        ('vect', vector(ngram_range=n_gram)),
        # ('manual', vector(tokenizer=nozip_ngrams)) # test n_gram manual
    ])

    model = classifier()
    # Only seed estimators that actually have the parameter; estimators
    # without it (e.g. MultinomialNB) are left untouched.
    if 'random_state' in model.get_params():
        model.set_params(random_state=random_state)

    pipeline = Pipeline([
        ('features', combined_features),
        ('clf', model)
    ])

    pipeline.fit(X_train, y_train)

    y_predictions = pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_predictions)
    res.append([vector.__name__, n_gram, classifier.__name__, accuracy, pipeline])


# Evaluate every combination, classifiers varying fastest and vectorizers slowest.
for vectorizer, n_gram, classifier in (
    (vec_cls, gram_range, clf_cls)
    for vec_cls in vectorizers
    for gram_range in n_grams
    for clf_cls in classiffiers
):
    test_vect_gram_class(vectorizer, n_gram, classifier)

In [5]:
def get_max(res):
    """Return the row of ``res`` with the highest accuracy.

    Args:
        res: Iterable of result rows whose element at index 3 is the accuracy.

    Returns:
        The first row holding the maximum accuracy, or ``[]`` when ``res`` is
        empty.
    """
    # Start below any real score so rows with an accuracy of 0.0 (or lower)
    # can still be selected instead of falling through to the empty default.
    max_acc = float("-inf")
    max_res = []
    for r in res:
        # Strict comparison keeps the earliest row when accuracies tie.
        if r[3] > max_acc:
            max_acc = r[3]
            max_res = r
    return max_res
# Bare last expression so the notebook displays the highest-accuracy row of `res`.
get_max(res)

['TfidfVectorizer',
 (1, 3),
 'MultinomialNB',
 0.83,
 Pipeline(steps=[('features',
                  FeatureUnion(transformer_list=[('vect',
                                                  TfidfVectorizer(ngram_range=(1,
                                                                               3)))])),
                 ('clf', MultinomialNB())])]

In [7]:
# Index 4 of a result row holds the fitted pipeline.
best_model = get_max(res)[4]

# A few hand-written reviews as a quick sanity check of the selected model.
text = [
    "I love this restaurant",
    "Terrible, hated it",
    "It was good",
    "I went to this restaurant.",
]
best_model.predict(text)

array([1, 0, 1, 0], dtype=int64)