In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
from utils import *
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

# Movie Reviews

In [2]:
import pandas as pd

# Load the review corpus from a CSV file given by a relative path.
data = pd.read_csv(filepath_or_buffer="reviews.csv")

# Preview the first rows to see which columns were read.
data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


The dataset is made up of positive and negative movie reviews.

## Preprocessing

👇 Remove punctuation and lowercase the text.

In [3]:
# Apply the two text-cleaning helpers (presumably provided by the star import
# from utils) to the 'reviews' column. The result is assigned back to `data` so
# that the cells which later read data['reviews'] get the cleaned text even if
# the helpers return a new frame instead of editing their argument in place.
data = lowercase(clean_text(data, 'reviews'), 'reviews')

# Display a few random rows to eyeball the cleaned text.
data.sample(3)

Unnamed: 0,target,reviews
1441,pos,call me crazy but i dont see saving private r...
1846,pos,plot a young recruit gets plucked out of nowh...
770,neg,in french the phrase film noir literally me...


## Bag-of-Words modelling

👇 Using `cross_validate`, score a Multinomial Naive Bayes model trained on a Bag-of-Words representation of the texts.

In [4]:
# Bag-of-words features: token counts per review.
# NOTE(review): the vocabulary is fitted on every review before the split
# below, so the held-out reviews contribute to it.
vect = CountVectorizer()
count_matrix = vect.fit_transform(data['reviews'])

# Keep the counts in the sparse matrix returned by the vectorizer. Both
# train_test_split and MultinomialNB accept it directly, so there is no need
# to materialise a dense documents x vocabulary table.
X = count_matrix
y = data['target']

# A fixed random_state keeps the hold-out split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=137)

nb = MultinomialNB()
nb.fit(X_train, y_train)

y_predict = nb.predict(X_test)
# Despite its name, this variable holds the hold-out accuracy in percent.
precision = round(metrics.accuracy_score(y_test, y_predict)*100,2)

print(f"Précision : {precision}%")

Précision : 85.8%


## N-gram modelling

👇 Using `cross_validate`, score a Multinomial Naive Bayes model trained on a 2-gram Bag-of-Words representation of the texts.

In [5]:
# 2-gram features: counts of consecutive token pairs only (ngram_range=(2, 2)).
# NOTE(review): the vocabulary is fitted on every review before the split
# below, so the held-out reviews contribute to it.
vect = CountVectorizer(ngram_range=(2, 2))
count_matrix = vect.fit_transform(data['reviews'])

# Keep the counts sparse. A dense copy would need one column per distinct
# bigram in the corpus, and neither train_test_split nor MultinomialNB
# needs a dense input.
X = count_matrix
y = data['target']

# A fixed random_state keeps the hold-out split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=137)

nb = MultinomialNB()
nb.fit(X_train, y_train)

y_predict = nb.predict(X_test)

# Report the hold-out accuracy as a percentage rounded to two decimals.
print(f"Précision : {round(metrics.accuracy_score(y_test, y_predict)*100,2)}%")

Précision : 86.8%


⚠️ Please push the exercise once you are done 🙃

## 🏁 