In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Bag-of-Words (BoW) and TF-IDF for Creating Features
- Text vectorization (sometimes loosely called word embedding) is where we represent text using numerical vectors.
- Bag-of-Words (BoW) and Term Frequency-Inverse Document Frequency (TF-IDF) are vectorization techniques that help us convert text sentences into numeric vectors.

## Import data

In [None]:
# Read the IMDB movie-review CSV into a DataFrame and print a summary of it.
IMDB_CSV_PATH = "../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
data = pd.read_csv(IMDB_CSV_PATH)
data.info()

## Initial EDA

In [None]:
# Draw a random 20% subset of the rows (sampling also shuffles them) for quick
# exploration and a first idea of results.
# random_state is fixed so the subset, and every score computed from it, is the
# same after a kernel restart; without it each run samples different reviews.
sample_data = data.sample(frac=0.2, random_state=1)
sample_data.info()

In [None]:
# Preview the first few rows of the sampled reviews.
sample_data.head()

In [None]:
# Count how many sampled reviews fall into each sentiment class.
sample_data['sentiment'].value_counts()

Looks like the categories are fairly evenly distributed. This is good.

In [None]:
# Split sample data into train and test sets
from sklearn.model_selection import train_test_split

# Features are the review texts; targets are the sentiment labels.
X, y = sample_data["review"], sample_data["sentiment"]

# Hold out 20% of the sample as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

 To create a basic bag of words, use CountVectorizer.
 - Text preprocessing, tokenizing and filtering of stop words are all included in CountVectorizer (stop-word filtering is off by default; enable it with the `stop_words` parameter)
 - CountVectorizer supports counts of N-grams of words or consecutive characters
 
 - https://towardsdatascience.com/basics-of-countvectorizer-e26677900f9c

In [None]:
# Using CountVectorizer to create BoW
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer() # e.g. max_df=0.9, min_df = 0.15 to prune the vocabulary
X_train_counts = count_vect.fit_transform(X_train)

# CountVectorizer outputs a sparse matrix. Calling .toarray() on all of it would
# allocate n_documents x vocabulary_size dense integers just for viewing, so only
# the first few rows are converted to a dense array here.
PREVIEW_ROWS = 5
X_train_counts_array = X_train_counts[:PREVIEW_ROWS].toarray()

# Create df from the preview array for easier viewing (one column per vocabulary term)
df = pd.DataFrame(data=X_train_counts_array, columns=count_vect.get_feature_names_out())

# Shape of the full sparse matrix: (number of training reviews, vocabulary size)
print(X_train_counts.shape)

# print(df)

# print(count_vect.vocabulary_) # returns dictionary mapping each term to its column (feature) index, not its count

# print(count_vect.get_feature_names_out().tolist()) # returns list of words


As you can see, there are a lot of non-words and words in other languages. Clean this up by using max_df (exclude words occurring above the threshold) and min_df (exclude words occurring below the threshold). The idea is that words that occur very often (as, is, the) are unimportant, and words that occur very rarely are of no value for learning.

TF-IDF is better than Count Vectorizers because it not only focuses on the frequency of words present in the corpus but also provides the importance of the words. We can then remove the words that are less important for analysis, hence making the model building less complex by reducing the input dimensions.

In [None]:
# Use TfidfTransformer on BoW created by CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the transformer on the training counts and re-weight them in a single call.
tf_transformer = TfidfTransformer()
X_train_tf = tf_transformer.fit_transform(X_train_counts)
X_train_tf.shape

In [None]:
# Compare Multinomial Naive Bayes on raw counts vs. TF-IDF weighted counts.
# Each classifier must be scored on the same feature representation it was
# trained on, so the two variants get their own fitted model.
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Vectorize the test reviews with the vocabulary/IDF weights learned on the training set.
X_new_counts = count_vect.transform(X_test)
X_new_tfidf = tf_transformer.transform(X_new_counts)

# Variant 1: trained on raw counts, evaluated on raw counts.
clf_counts = MultinomialNB().fit(X_train_counts, y_train)
predicted_counts = clf_counts.predict(X_new_counts)

print("classifier with count vectorizer only:")
print(metrics.classification_report(y_test, predicted_counts))

# Variant 2: trained on TF-IDF features, evaluated on TF-IDF features.
clf = MultinomialNB().fit(X_train_tf, y_train)
predicted = clf.predict(X_new_tfidf)

print("classifier with count vectorizer and tfidf transformer:")
print(metrics.classification_report(y_test, predicted))

As tf–idf is very often used for text features, there is also another class called TfidfVectorizer that combines all the options of CountVectorizer and TfidfTransformer in a single model:

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# TfidfVectorizer goes from raw text to TF-IDF features in one step:
# fit on the training reviews, then apply the same mapping to the test reviews.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train Naive Bayes on the TF-IDF features and score the held-out reviews.
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)
predicted = clf.predict(X_test_tfidf)

print("classifier with tfidfVectorizer:")
print(metrics.classification_report(y_test, predicted))

## BUILDING A PIPELINE

In [None]:
from sklearn.pipeline import Pipeline

# Chain vectorizing, TF-IDF weighting and Naive Bayes into one estimator so the
# same preprocessing is applied at fit and predict time.
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf_pipeline = Pipeline(steps=nb_steps)

# fit() returns the pipeline itself, so training and prediction can be chained.
predicted = text_clf_pipeline.fit(X_train, y_train).predict(X_test)

print(metrics.classification_report(y_test, predicted))
print(metrics.confusion_matrix(y_test, predicted))


## TRYING other CLASSIFIERs

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

# Same text pipeline as before, with a linear model trained by stochastic
# gradient descent as the final step.
# SGDClassifier shuffles the training data by default, so random_state is fixed;
# otherwise the scores used to compare classifiers change on every run.
text_clf_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(random_state=1)),
])

text_clf_pipeline.fit(X_train, y_train)
predicted = text_clf_pipeline.predict(X_test)

print(metrics.classification_report(y_test, predicted))
print(metrics.confusion_matrix(y_test, predicted))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# Counts -> TF-IDF -> k-nearest-neighbours, all with default settings.
knn_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier()),
]
text_clf_pipeline = Pipeline(knn_steps)
text_clf_pipeline.fit(X_train, y_train)

# Score the held-out reviews, then show per-class metrics and the confusion matrix.
predicted = text_clf_pipeline.predict(X_test)
knn_report = metrics.classification_report(y_test, predicted)
knn_confusion = metrics.confusion_matrix(y_test, predicted)
print(knn_report)
print(knn_confusion)

In [None]:
# try with TfidfVectorizer instead of CountVectorizer and TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Two-step pipeline: raw text -> TF-IDF features -> k-nearest-neighbours.
text_clf_pipeline = Pipeline(
    steps=[
        ('vect', TfidfVectorizer()),
        ('clf', KNeighborsClassifier()),
    ]
)

# Train on the training reviews and predict the held-out ones in one chained call.
predicted = text_clf_pipeline.fit(X_train, y_train).predict(X_test)

# Per-class metrics first, confusion matrix second.
for evaluate in (metrics.classification_report, metrics.confusion_matrix):
    print(evaluate(y_test, predicted))

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Counts -> TF-IDF -> logistic regression with default hyper-parameters.
logreg_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
]
text_clf_pipeline = Pipeline(steps=logreg_steps)

# Train, then predict the sentiment of the held-out reviews.
text_clf_pipeline.fit(X_train, y_train)
predicted = text_clf_pipeline.predict(X_test)

logreg_report = metrics.classification_report(y_test, predicted)
print(logreg_report)
logreg_confusion = metrics.confusion_matrix(y_test, predicted)
print(logreg_confusion)

So far, the best model seems to be SGDClassifier. Let's try it on the full dataset.

In [None]:
# Use the complete dataset (not the sample) as features and targets.
X, y = data["review"], data["sentiment"]

# Hold out 30% of all reviews for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

# Raw text -> TF-IDF features -> linear classifier trained with SGD.
# SGDClassifier shuffles the training data by default, so random_state is fixed
# to make the reported scores (and the predictions made with this pipeline
# afterwards) repeatable across runs.
text_clf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SGDClassifier(random_state=1)),
])

text_clf_pipeline.fit(X_train, y_train)
predicted = text_clf_pipeline.predict(X_test)

print(metrics.classification_report(y_test, predicted))
print(metrics.confusion_matrix(y_test, predicted))

Great, it seems the SGD model performed better on the full dataset.

Let's now predict the sentiment of some unseen reviews.

In [None]:
# Wrap a single hand-written review in a Series and classify it with the fitted pipeline.
review1 = pd.Series(["LOVED IT! This movie was amazing. Top 10 this year."])
prediction = text_clf_pipeline.predict(review1)

# Show the review text and the predicted label on separate lines.
print(f"Review: {review1.values}", f"Sentiment: {prediction}", sep="\n")

In [None]:
# Second hand-written review, classified with the same fitted pipeline.
review2 = pd.Series(["Total junk! I'll never watch a film by that director again, no matter how good the reviews."])
prediction = text_clf_pipeline.predict(review2)

# Assemble both output lines first, then print them together.
output_lines = [f"Review: {review2.values}", f"Sentiment: {prediction}"]
print("\n".join(output_lines))

In [None]:
# 60%-20%-20% for train-validation-test

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

# another method
# train, validate, test = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])