In [1]:
# Let's use some of our newfound knowledge with IMDB reviews and their overall sentiments

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Sweet, nltk is already present in our system
import nltk

In [2]:
# Example of how nltk can help us tokenize words

from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

#function to split text into word
tokens = word_tokenize("The quick brown fox jumps over the lazy dog")
print(tokens)

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jonny\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jonny\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# We can even tokenize the strange words in the magic dataset.
# However, it seems that we still have the punctuation. We deal with this.
tokens = word_tokenize("Put a +1/+1 counter on target creature, then draw a card.")
print(tokens)

['Put', 'a', '+1/+1', 'counter', 'on', 'target', 'creature', ',', 'then', 'draw', 'a', 'card', '.']


In [4]:
# Dealing with common words in english

import string
from nltk.corpus import stopwords
# Define the common, useless (fluff) words in English language
stop_words = set(stopwords.words('english'))

# Select all the important english words, then remove all the punctuation
tokens = word_tokenize("Put a +1/+1 counter on target creature, then draw a card.")
tokens = [w for w in tokens if not w in stop_words]
tokens = list(filter(lambda token: token not in string.punctuation, tokens))
print(tokens)

['Put', '+1/+1', 'counter', 'target', 'creature', 'draw', 'card']


In [5]:
# Load the IMDB dataset

df = pd.read_csv('Datacamp CSV/IMDB Dataset.csv')

df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [9]:
# What are the top 50 words used in the review dataset?

# Combine all the reviews into a single text. NB: The dataset is massive, so we'll only use
# around 2000 samples.

df = df.iloc[:5000]

# Around 50-50 good and bad reviews
print(df.sentiment.value_counts())

negative    2532
positive    2468
Name: sentiment, dtype: int64

In [18]:
reviews = df.review.str.cat(sep = ' ')

tokens = word_tokenize(reviews)
tokens = [w.lower() for w in tokens]
tokens = [w for w in tokens if not w in stop_words]
tokens = list(filter(lambda token: token not in string.punctuation, tokens))

# There are 57k different 'words' used in the review. However, some of those are likely not
# real words. If we remove the useless words and the punctuation, we remove around 
vocabulary = set(tokens)
print(len(vocabulary))

49451


In [19]:
# It looks like a majority of the words are actually punctuations, plus useless words like 'a',
# or 'and' or 'the'
frequency_dist = nltk.FreqDist(tokens)
sorted(frequency_dist, key=frequency_dist.__getitem__, reverse=True)[0:50]

# After some pre-processing, it looks like we've removed most of the faff. 

['br',
 "'s",
 'movie',
 'film',
 "''",
 '``',
 "n't",
 'one',
 'like',
 'good',
 'would',
 'even',
 '...',
 'see',
 'time',
 'story',
 'really',
 'well',
 'much',
 'get',
 'bad',
 'could',
 'people',
 'first',
 'also',
 'great',
 'make',
 'way',
 'made',
 'movies',
 'characters',
 'watch',
 'think',
 'character',
 'never',
 'little',
 'films',
 'many',
 'seen',
 'plot',
 'two',
 'acting',
 'best',
 'know',
 'show',
 'love',
 'ever',
 'life',
 'scene',
 'better']

In [65]:
# Let's do some pipelining

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import PolynomialFeatures

# Create our dataset. NB: Witholding 1000 samples for future testing.
X = df.iloc[:4000, 0]
y = df.iloc[:4000, 1]

# Separate into train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y,
                                                    test_size = 0.2, 
                                                    random_state = 42)

alphanumeric = '[A-Za-z0-9]+(?=\\s+)'

# Create our pipeline. 
pipeline = Pipeline([('vectorizer', CountVectorizer(token_pattern = alphanumeric, 
                                                    stop_words = 'english')), 
                     ('transformer', TfidfTransformer()), 
                     # Remember that polynomialfeatures takes AGES, and requires a MASSIVE
                     # amount of code.
                     # ('interaction', PolynomialFeatures(degree = 2)),
                     # ('dim_red', TruncatedSVD()),
                     # ('clf', GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 9))
                     # ('clf', KNeighborsClassifier(n_neighbors = 15))
                     # ('clf', RandomForestClassifier(max_features = 'log2', n_estimators = 1000))
                     ('clf', LogisticRegression(C = 100, solver = 'newton-cg'))
                     # ('clf', MultinomialNB(alpha = 0.9, fit_prior = False))
                    ])

pipeline.fit(X_train, y_train)

# Sweet, even just using a very simple Logistic Regression tool, we have managed to achieve 
# an 82.8% accuracy on Sentiment Analysis, with a 0.484 log_loss. 

# With tuned hyperparameters, its gone up to 83.6% accuracy, 0.370 log_loss

print(pipeline.score(X_test, y_test))

# For reference, in a binary classification with 50-50 odds of either class, the log_loss
# to beat is around 0.693.
y_pred_probs = pipeline.predict_proba(X_test)
print(log_loss(y_test, y_pred_probs))

0.83625
0.3702975147147943


In [37]:
# How to fit new data to our model?
X_holdout = df.iloc[4000:, 0]
y_holdout = df.iloc[4000:, 1]

# Pretty simple!
from sklearn.metrics import accuracy_score
y_pred = pipeline.predict_proba(X_holdout)
print(log_loss(y_holdout, y_pred))

0.3958225761121344


In [57]:
# GridSearchCV with the pipeline
from sklearn.model_selection import GridSearchCV

# Decent Resource for Hyperparameter tuning
# https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/


# Parameters of pipelines can be set using ‘__’ separated parameter names:
# param_grid = {
#     'clf__C': [100, 10, 1.0, 0.1, 0.01],
#     'clf__solver': ['newton-cg', 'lbfgs', 'liblinear'],
#     'clf__penalty': ['l2']
# }

# param_grid = {
#     'clf__n_neighbors': [5, 10, 15, 20]
# }

# param_grid = {
#     'clf__max_features': ['sqrt', 'log2'],
#     'clf__n_estimators': [10, 100, 1000]
# }

# param_grid = {
#     'clf__n_estimators': [10, 100],
#     'clf__learning_rate': [0.001, 0.01, 0.1],
#     'clf__max_depth': [3, 7, 9]
# }

param_grid = {
    'clf__alpha': np.linspace(0.5, 1.5, 6),
    'clf__fit_prior': [True, False]
}

search = GridSearchCV(pipeline, param_grid, cv = 3, n_jobs=-1, verbose = 1)
search.fit(X_train, y_train)
print("Best parameter CV score={:.3f}:".format(search.best_score_))
print(search.best_params_)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:   18.2s finished


Best parameter CV score=0.808:
{'clf__alpha': 0.9, 'clf__fit_prior': False}


In [None]:
# Testing different models

# We can try several different models to see how they perform on the data. 

# Tuned LogisticRegression --> 83.6% acc, 0.370 log_loss
# GradientBoosting --> 78.1% acc, 0.500 log_loss
# Tuned GradientBoosting --> 78.1% acc, 0.472 log_loss
# Tuned KNN --> 68.1% acc, 0.604 log_loss
# Tuned RandomForest --> 83.1% acc, 0.577 log_loss
# Tuned Multinomial NB --> 80.5% acc, 0.509 log_loss

# Interesting to note that the randomforest has a high accuracy, but also has a bad log_loss
# It looks like LogisticRegression is still the best!