## Imports

In [1]:
import pandas as pd
import re

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet, stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## Loading data

In [2]:
# Read the review file, keep only the text and tag columns, then draw a
# reproducible 1000-row sample (fixed random_state) and renumber the rows
# from 0 so they can be addressed by position below.
reviews_data = (
    pd.read_csv("movie_review.csv")
    .loc[:, ['text', 'tag']]
    .sample(n=1000, random_state=42)
    .reset_index(drop=True)
)
reviews_data.head()

Unnamed: 0,text,tag
0,it's like a dream without any appeal .,neg
1,state-of-the-art special effects have never be...,neg
2,some action films have action sequences that a...,neg
3,"a number of reasons , including the fact that ...",neg
4,julie james ( jennifer love hewitt ) and ray b...,neg


## Preparing data

In [3]:
# NLTK normalizers, instantiated once at module level.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def clean_review(review, remove_stopwords=False, stem=False):
    """
    Receives a raw review and cleans it using the following steps:
    1. Replace every character that is not an ASCII letter with a space
    2. Transform the review to lower case
    3. Tokenize the review with NLTK's ``word_tokenize``
    4. (optional) Remove all English stop words
    5. (optional) Perform stemming with the module-level Porter stemmer
    6. Join the tokens back into one space-separated string

    Steps 4 and 5 are disabled by default, so ``clean_review(review)``
    returns the same text as before these options were added.

    Args:
        review: the raw review (a string) that will be cleaned
        remove_stopwords: when True, drop tokens found in NLTK's English
            stop-word list (requires the NLTK ``stopwords`` corpus to be
            available locally)
        stem: when True, replace each remaining token by its Porter stem
    Returns:
        the cleaned review as a single string of space-separated tokens.
    """

    letters_only = re.sub("[^A-Za-z]", " ", review)
    tokens = word_tokenize(letters_only.lower())

    if remove_stopwords:
        # Build the stop-word set once per call instead of once per token.
        english_stopwords = set(stopwords.words("english"))
        tokens = [token for token in tokens if token not in english_stopwords]

    if stem:
        tokens = [stemmer.stem(token) for token in tokens]

    return " ".join(tokens)

In [4]:
# Take the first sampled review (row label 0) as a worked example and show
# its raw text.
review = reviews_data.loc[0, "text"]
print(review)

it's like a dream without any appeal .


In [5]:
# Run the example review through clean_review and print the result so it can
# be compared with the raw text printed above.
cleaned_review = clean_review(review)
print(cleaned_review)

it s like a dream without any appeal


In [6]:
# Clean every review in the sample, in row order. A comprehension keeps the
# loop variable local, so the notebook-level `review` example is not
# overwritten and the cells that use it stay re-runnable.
corpus = [clean_review(raw_text) for raw_text in reviews_data["text"]]

corpus[:5]

['it s like a dream without any appeal',
 'state of the art special effects have never been a carpenter trademark and once again the writer director who seems to have no problem finding work however doesn t waste any of the film s budget in that department',
 'some action films have action sequences that are so conventional our attention is detracted and diverted by other thoughts',
 'a number of reasons including the fact that all those experienced filmmakers behind and in front of the camera did a lousy job',
 'julie james jennifer love hewitt and ray bronson freddie prinze jr are back from the original to star in i still know reprising their roles in typical fashion']

## Extracting features using the Bag-of-Words and TF-IDF models

In [7]:
# Bag-of-words features: learn the vocabulary from the cleaned corpus with a
# default CountVectorizer and encode each review as a row of token counts.
# .toarray() turns the vectorizer's sparse result into a dense NumPy array.
count_vectorizer = CountVectorizer()
features_bow = count_vectorizer.fit_transform(corpus).toarray()
features_bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [8]:
# TF-IDF features: same corpus, but each review is encoded with TF-IDF
# weights from a default TfidfVectorizer instead of raw counts.
# .toarray() turns the vectorizer's sparse result into a dense NumPy array.
tfidf_vectorizer = TfidfVectorizer()
features_tfidf = tfidf_vectorizer.fit_transform(corpus).toarray()
features_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Using features to create a model

In [9]:
from sklearn.linear_model import LogisticRegression

In [10]:
# Logistic-regression classifier with scikit-learn's defaults (no arguments passed).
lr = LogisticRegression()

In [11]:
# Train on the bag-of-words features, using the review tags as labels.
lr.fit(features_bow, reviews_data.tag.values)

In [12]:
# NOTE: this scores the model on the same rows it was fit on, so the value is
# training accuracy, not an estimate of performance on unseen reviews.
lr.score(features_bow, reviews_data.tag.values)

0.987

In [13]:
# Re-fit the same `lr` estimator on the TF-IDF features; from here on `lr`
# holds the TF-IDF fit rather than the bag-of-words one.
lr.fit(features_tfidf, reviews_data.tag.values)

In [14]:
# NOTE: scored on the same TF-IDF rows the model was fit on, so this is
# training accuracy rather than a held-out estimate.
lr.score(features_tfidf, reviews_data.tag.values)

0.938