In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 
import re
import string
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Data Exploration and Processing

In [4]:
# Load the IMDB movie-review CSV from the Kaggle input directory and preview the first rows.
data = pd.read_csv("../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
data.head()


In [5]:
# (rows, columns) of the loaded frame.
data.shape

In [6]:
# Bar chart of how many reviews carry each sentiment label.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Data Distribution")
sns.countplot(x="sentiment", data=data, ax=ax)

plt.show()

In [7]:
# Word count of each review (whitespace-separated tokens). The column keeps the
# 'lenght' spelling because the plotting cells look it up under that exact name.
data['lenght'] = data['review'].map(lambda review: len(review.split()))

In [8]:
# Histogram of review word counts over the whole dataset.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Text lenght")
sns.histplot(x="lenght", data=data, ax=ax)
plt.show()

In [9]:
# Side-by-side word-count histograms, one panel per sentiment class.
fig, axes = plt.subplots(1, 2, figsize=(14, 8))
panels = (
    ("positive", "Positive text lenght"),
    ("negative", "Negative text lenght"),
)
for ax, (sentiment, title) in zip(axes, panels):
    ax.set_title(title)
    sns.histplot(x="lenght", data=data[data['sentiment'] == sentiment], ax=ax)

plt.show()

In [10]:
# Example: value at row 1, column 0 (presumably the review text), shown here
# so it can be compared with the same cell after cleaning.
data.iloc[1,0]

In [13]:
def clean_review(text):
    """Normalize one raw review for modelling.

    HTML line breaks (``<br>``, ``<br/>``, ``<br />``, in any letter case)
    become a space so the words on either side are not glued together. Every
    character other than an ASCII letter or an apostrophe also becomes a
    space, and the result is lower-cased.

    Args:
        text: Raw review string.

    Returns:
        The cleaned, lower-cased string. Runs of spaces are not collapsed.
    """
    # Raw strings keep the regex escapes out of Python's own escape handling.
    without_breaks = re.sub(r'<br\s*/?>', ' ', text, flags=re.IGNORECASE)
    letters_only = re.sub(r"[^a-zA-Z']", ' ', without_breaks)
    return letters_only.lower()

In [14]:
# Overwrite the review column with its cleaned version, then re-display the
# example cell to see the effect.
data['review'] = data['review'].map(clean_review)
data.iloc[1,0]

## Modelling
### Bag Of Words

In [16]:
# Hold-out split by row position: the first 30,000 rows train, the rest validate.
n_train_rows = 30000
train_data = data.iloc[:n_train_rows]
val_data = data.iloc[n_train_rows:]

For efficient processing, we start from a blank model so that pipeline components we do not need, such as the parser and named entity recognition, are left out.

In [17]:
import spacy

# Start from an empty English model.
nlp = spacy.blank("en")

# Text classifier configured for mutually exclusive classes and a bag-of-words architecture.
textcat_config = {"exclusive_classes": True, "architecture": "bow"}
textcat = nlp.create_pipe("textcat", config=textcat_config)

# Attach the classifier to the model, show the pipeline, then declare the labels
# in the order positive, negative.
nlp.add_pipe(textcat)
print(nlp.pipe_names)
for sentiment_label in ("positive", "negative"):
    textcat.add_label(sentiment_label)

In [18]:
# Data Preparation: review strings plus one {'cats': {...}} annotation per
# review, holding a boolean flag for each of the two classes.
train_texts = train_data['review'].values
train_labels = []
for label in train_data['sentiment']:
    cats = {'positive': label == 'positive', 'negative': label == 'negative'}
    train_labels.append({'cats': cats})

In [19]:
from spacy.util import minibatch
import random

def model_train(model, train, optimizer):
    """Run one pass over ``train`` in mini-batches and return the losses.

    Args:
        model: Object exposing ``update(texts, labels, sgd=..., losses=...)``.
        train: List of ``(text, annotation)`` pairs; it is shuffled in place.
        optimizer: Passed to ``model.update`` as ``sgd``.

    Returns:
        The dict handed to ``model.update`` as ``losses`` on every batch.
    """
    # The RNG is re-seeded on every call, so each call applies the same
    # permutation to whatever order the list currently has.
    random.seed(1)
    random.shuffle(train)

    epoch_losses = {}
    for chunk in minibatch(train, size=8):
        chunk_texts, chunk_labels = zip(*chunk)
        model.update(chunk_texts, chunk_labels, sgd=optimizer, losses=epoch_losses)

    return epoch_losses

In [20]:
# 1st Iteration
# Seed spaCy and Python's RNG before the optimizer is initialised.
spacy.util.fix_random_seed(1)
random.seed(1)
optimizer = nlp.begin_training()
# Pair each review with its annotation dict; model_train shuffles this list in place.
train = list(zip(train_texts, train_labels))
losses = model_train(nlp, train, optimizer)
# Loss recorded under the 'textcat' key for this single pass.
print(losses['textcat'])

Prediction:

In [21]:
# First two columns of row 30001, which lies in the validation part of the split (rows 30000 onward).
data.iloc[30001,:2]

In [22]:
# Run the pipeline on that single held-out review and print what it stored in doc.cats.
doc = nlp(data.iloc[30001,0])
print(doc.cats)

In [23]:
# Predict list of reviews
def predict(nlp, texts):
    """Return one integer class per text using the pipeline's 'textcat' pipe.

    Each text is only tokenized (``nlp.tokenizer``); the resulting docs are
    then scored by the text categorizer directly.

    Args:
        nlp: Pipeline exposing ``tokenizer`` and ``get_pipe``.
        texts: Iterable of review strings.

    Returns:
        Array holding, for each text, the index of its lowest-scoring column.
        NOTE(review): with the labels registered as positive then negative,
        this presumably yields 1 for positive and 0 for negative -- confirm
        the score column order against the categorizer's labels.
    """
    tokenized = list(map(nlp.tokenizer, texts))
    categorizer = nlp.get_pipe('textcat')
    # The first item of the predict() result is the score matrix.
    scores = categorizer.predict(tokenized)[0]
    return scores.argmin(axis=1)

In [24]:
# First two columns of three consecutive held-out rows, for comparison with their predictions.
data.iloc[30001:30004,:2]

In [25]:
# Negative review -> 0; positive review -> 1
# Predicted class for rows 30001-30003 (column 0 is passed in as the text).
predict(nlp, list(data.iloc[30001:30004,0].values))

In [26]:
from sklearn.metrics import accuracy_score, f1_score
# Encode the validation labels as integers (negative -> 0, positive -> 1).
mapper = {'positive':1, 'negative':0}
# Build a new frame with assign() instead of writing a column onto the slice
# taken from `data`, which triggers SettingWithCopyWarning. Values that are
# already 0/1 pass through unchanged so re-running this cell does not fail;
# any other unknown label still raises KeyError.
val_data = val_data.assign(
    sentiment=val_data['sentiment'].map(
        lambda label: label if label in mapper.values() else mapper[label]
    )
)
val_data.sentiment.values

In [27]:
def evaluate(model, texts, labels):
    """Score the model's predictions on ``texts`` against the true ``labels``.

    Args:
        model: Pipeline passed through to ``predict``.
        texts: List of review strings.
        labels: True integer class for each text, aligned with ``texts``.

    Returns:
        ``(accuracy, fscore)`` tuple.
    """
    predicted_class = predict(model, texts)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    accuracy = accuracy_score(labels, predicted_class)
    fscore = f1_score(labels, predicted_class)
    return accuracy, fscore

In [28]:
# Validation metrics for the model as trained so far.
val_texts = list(val_data.review.values)
val_targets = val_data.sentiment.values
accuracy, f1score = evaluate(nlp, val_texts, val_targets)
print(f"Accuracy: {accuracy:.4f}")
print(f"F1_score: {f1score:.4f}")

In [29]:
# Model Training: six further passes over the training pairs, reporting the
# training loss and the validation metrics after each pass.
n_iters = 6
val_texts = list(val_data.review.values)
val_targets = val_data.sentiment.values
for epoch in range(n_iters):
    losses = model_train(nlp, train, optimizer)
    accuracy, f1score = evaluate(nlp, val_texts, val_targets)
    print(f"Loss: {losses['textcat']:.3f} \t Accuracy: {accuracy:.3f} \t F1_Score: {f1score:.3f}")

In [30]:
from wordcloud import WordCloud
from nltk.corpus import stopwords
# NLTK's English stopword list as a set; it is passed to WordCloud as `stopwords`.
stp = set(stopwords.words('english'))

Word cloud of positive reviews. It is better to add some words to our stopword set, such as "movie", "film" and "story", because such words may appear in both positive and negative reviews.

In [32]:
# Join every positive training review into one string to feed the word cloud.
positive_mask = train_data['sentiment'] == 'positive'
poswords = ' '.join(train_data.loc[positive_mask, 'review'])

cloud_options = dict(width=800, height=500, random_state=21, max_font_size=110,
                     stopwords=stp, background_color='white')
wordcloud = WordCloud(**cloud_options).generate(poswords)

plt.figure(figsize=(8, 5))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

### CNN

In [33]:
# Remove the bag-of-words pipeline, classifier and optimizer names before they are rebound for the CNN model.
del nlp, textcat, optimizer

In [34]:
# Fresh empty English model for the CNN experiment.
nlp = spacy.blank("en")

# Text classifier configured for mutually exclusive classes and the simple CNN architecture.
textcat_config = {"exclusive_classes": True, "architecture": "simple_cnn"}
textcat = nlp.create_pipe("textcat", config=textcat_config)
nlp.add_pipe(textcat)
print(nlp.pipe_names)
# Labels are declared in the order positive, negative.
for sentiment_label in ("positive", "negative"):
    textcat.add_label(sentiment_label)

Training a CNN architecture needs a lot of resources (RAM), so I will use a small subset of the data.

In [35]:
def load_data(data, n_train=10000, n_val=3000):
    """Build training pairs and validation arrays from the leading rows of ``data``.

    Args:
        data: DataFrame with a 'review' column of text and a 'sentiment'
            column holding 'positive' / 'negative'. It is not modified.
        n_train: Number of leading rows used for training.
        n_val: Number of rows immediately after the training rows used for
            validation.

    Returns:
        ``(train, val_texts, val_labels)``: ``train`` is a list of
        ``(text, {'cats': {...}})`` pairs, ``val_texts`` is a list of review
        strings and ``val_labels`` is an array of 1 (positive) / 0 (negative).

    Raises:
        KeyError: If a validation label is neither 'positive' nor 'negative'.
    """
    # Splitting the data into train set and validation set
    train_data = data[:n_train]
    val_data = data[n_train:n_train + n_val]

    mapper = {'positive': 1, 'negative': 0}
    # Encode into a separate array rather than assigning a column onto the
    # slice of the caller's frame, which triggers SettingWithCopyWarning.
    val_labels = val_data['sentiment'].apply(lambda x: mapper[x]).values

    train_texts = train_data['review'].values
    train_labels = [{'cats': {'positive': label == 'positive', 'negative': label == 'negative'}}
                    for label in train_data['sentiment']]
    return list(zip(train_texts, train_labels)), list(val_data['review'].values), val_labels

In [36]:
# A single training pass for the CNN model.
n_iters = 1
# Seed spaCy and Python's RNG before the optimizer is initialised.
spacy.util.fix_random_seed(1)
random.seed(1)
optimizer = nlp.begin_training()
# Training pairs plus validation texts and labels from the reduced subset built by load_data.
train, valrev, valsnt = load_data(data)
for i in range(n_iters):
    losses = model_train(nlp, train, optimizer)
    accuracy, f1score = evaluate(nlp, valrev, valsnt)
    print(f"Loss: {losses['textcat']:.3f} \t Accuracy: {accuracy:.3f} \t F1_Score: {f1score:.3f}")