In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
from utils import *
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn import metrics

# Vectorizer Tuning

In [2]:
import pandas as pd

# Read the review corpus from a CSV referenced by a relative path.
data = pd.read_csv(filepath_or_buffer="reviews.csv")

# Preview the first five rows (the default row count of `head`, spelled out).
data.head(n=5)

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


The dataset is made up of positive and negative movie reviews.

## Preprocessing

👇 Remove punctuation and lowercase the text.

In [3]:
# Clean the `reviews` column with `clean_text`, then pass the result through
# `lowercase` (both helpers come from the star-import of `utils`).
#
# The returned frame is bound back to `data` so that every later cell works on
# the preprocessed text.
# NOTE(review): whether the helpers modify their argument in place is not
# visible from this notebook; keeping the returned frame is correct either way.
data = lowercase(clean_text(data, 'reviews'), 'reviews')

# Show three random rows as a sanity check. The seed is fixed so the preview
# is identical after Restart & Run All.
data.sample(3, random_state=0)

Unnamed: 0,target,reviews
655,neg,my son and i share a perverse predilection for...
927,neg,the lives of older people in the twilight of t...
597,neg,for about twenty minutes into mission impossib...


## Tuning

👇 Tune a vectorizer of your choice (`CountVectorizer` or `TfidfVectorizer` — or try both!) and a `MultinomialNB` model simultaneously.

In [4]:
# --- Pipeline -----------------------------------------------------------
# Three chained steps: token counts -> TF-IDF re-weighting -> Multinomial
# Naive Bayes classifier.
pipeline = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

# --- Search space (vectorizer, TF-IDF weighting and model) ----------------
# Each key is `<step name>__<parameter name>`, targeting one pipeline step.
# 2 * 2 * 2 * 2 * 2 * 7 = 224 parameter combinations in total.
param_grid = dict(
    vect__ngram_range=[(1, 1), (2, 2)],
    vect__binary=[True, False],
    tfidf__use_idf=[True, False],
    tfidf__sublinear_tf=[True, False],
    tfidf__norm=['l1', 'l2'],
    clf__alpha=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
)

# Features are the review texts, labels are the `target` column.
X, y = data['reviews'], data['target']

# --- Perform grid search on the pipeline ----------------------------------
# No `cv` argument is passed, so the library default is used; candidates are
# ranked by accuracy, all available cores are used, and progress is logged.
gs = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
)

gs.fit(X, y)

# --- Report ---------------------------------------------------------------
# Best mean cross-validated score, shown as a percentage with two decimals.
best_accuracy_pct = round(gs.best_score_ * 100, 2)
print(f"\nPrécision : {best_accuracy_pct}%")

# Winning parameters, listed in the reverse of the order in which
# `best_params_` stores them.
print("\nParamètres :")
for name in reversed(list(gs.best_params_)):
    print(f" - {name} : {gs.best_params_[name]}")

Fitting 5 folds for each of 224 candidates, totalling 1120 fits

Précision : 85.9%

Paramètres :
 - vect__ngram_range : (2, 2)
 - vect__binary : True
 - tfidf__use_idf : True
 - tfidf__sublinear_tf : True
 - tfidf__norm : l1
 - clf__alpha : 0.01


⚠️ Please push the exercise once you are done 🙃

## 🏁 