# Document Classification

**Import needed libraries**

In [28]:
import numpy as np
import pandas as pd
import spacy

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

**Classification**

In [2]:
# Load the Yelp reviews from the local tab-separated file. Column names are
# passed explicitly through `names`, so every line of the file is read as data.
df_yelp = pd.read_csv(
    'data/yelp_labelled.txt',
    sep='\t',
    names=['sentence', 'label'],
)

# Show the first few rows as the cell output
df_yelp.head()

Unnamed: 0,sentence,label
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


**split dataset**

- Split before vectorizing, so the vectorizer is fit on the training sentences only and no information from the test set leaks into the features.

In [3]:
# Target: the `label` column; features: the raw `sentence` text
y = df_yelp['label']
sentences = df_yelp['sentence']

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences,
    y,
    random_state=42,
    test_size=0.25,
)

**Vectorizing**

In [4]:
# tf-idf over bigrams only (ngram_range=(2, 2)) with English stop words removed
vectorizer = TfidfVectorizer(ngram_range=(2, 2), stop_words='english')

# Learn the vocabulary and idf weights from the training sentences and
# vectorize them in the same step; the fitted vectorizer is then reused,
# unchanged, on the test sentences
X_train = vectorizer.fit_transform(sentences_train)
X_test = vectorizer.transform(sentences_test)

# Cell output: summary of the vectorized training matrix
X_train

<750x2864 sparse matrix of type '<class 'numpy.float64'>'
	with 3051 stored elements in Compressed Sparse Row format>

**Baseline classification score**

In [5]:
# Baseline model: logistic regression (lbfgs solver) on the tf-idf features.
# `fit` returns the estimator itself, so it is chained onto the constructor.
classifier = LogisticRegression(solver='lbfgs').fit(X_train, y_train)

# Mean accuracy on the held-out test split
score = classifier.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.588


**Creating a two-step Pipeline for k-fold cross-validation**

- Vectorizer
- Classifier

In [6]:
# Chain the vectorizer and the classifier into one estimator so the grid
# search fits both steps together on each training fold
pipe = Pipeline(steps=[
    ('vect', vectorizer),
    ('clf', classifier),
])

# Parameter space for the grid search. Keys use `<step name>__<parameter>`;
# C is the inverse of the regularization strength (larger C = weaker penalty).
parameters = {'clf__C': [1, 10, 1000000]}

# 5-fold cross-validated search over the whole dataset, on all CPU cores
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(sentences, y)

# Cell output: best mean cross-validated score
grid_search.best_score_

Fitting 5 folds for each of 3 candidates, totalling 15 fits


0.611

**Pipeline with random forest**

In [7]:
# Fit a random forest with default hyperparameters on the tf-idf features.
# A random forest is a randomized estimator, so the seed is fixed (same value
# as the train/test split) to make the reported accuracy repeatable.
classifier = RandomForestClassifier(random_state=42)

classifier.fit(X_train, y_train)

# Mean accuracy on the held-out test split
score = classifier.score(X_test, y_test)

print("Accuracy:", score)

Accuracy: 0.56


In [16]:
# Candidate hyperparameter values for the random-forest grid search.
# Together they span 10 * 2 * 11 * 2 * 2 * 2 = 1,760 combinations.
# NOTE(review): the saved output under this cell reports 40 candidates, which
# does not match an exhaustive search over these lists -- confirm how that run
# was produced.

# Number of trees in random forest: 10, 20, ..., 100
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]
# Number of features to consider at every split.
# 'auto' is not listed: for classifiers it is an alias of 'sqrt' (so it only
# duplicated that candidate) and newer scikit-learn releases reject it.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree: 10 evenly spaced depths between 10 and 50
# (truncated to int), plus None for no depth limit
max_depth = [int(x) for x in np.linspace(10, 50, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]



# Two-step pipeline: tf-idf vectorizer followed by the random-forest classifier
pipe = Pipeline(steps=[
    ('vect', vectorizer),
    ('clf', classifier),
])

# Parameter space for the grid search. Each key is the step name ('clf'),
# two underscores, then the name of the parameter on that step.
parameters = {
    'clf__n_estimators': n_estimators,
    'clf__max_features': max_features,
    'clf__max_depth': max_depth,
    'clf__min_samples_split': min_samples_split,
    'clf__min_samples_leaf': min_samples_leaf,
    'clf__bootstrap': bootstrap,
}

# 5-fold cross-validated search over the whole dataset, on all CPU cores
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(sentences, y)

# Cell output: best mean cross-validated score
grid_search.best_score_

Fitting 5 folds for each of 40 candidates, totalling 200 fits


0.591

**Singular Value Decomposition (SVD)**

In [22]:
# Reload the Yelp reviews (tab-separated; column names supplied explicitly)
df_yelp = pd.read_csv(
    'data/yelp_labelled.txt',
    sep='\t',
    names=['sentence', 'label'],
)
df_yelp.head()  # result is discarded: only a cell's last expression is displayed

# Target and features
y = df_yelp['label']
sentences = df_yelp['sentence']

# Fresh, unfitted building blocks for the LSA pipeline:
# bigram tf-idf vectorizer with English stop words removed
vectorizer = TfidfVectorizer(ngram_range=(2, 2), stop_words='english')

# truncated SVD (the LSA step) with default settings
svd = TruncatedSVD()

# logistic regression with the lbfgs solver
classifier = LogisticRegression(solver='lbfgs')

In [26]:
# LSA sub-pipeline: tf-idf vectors reduced by truncated SVD
lsa = Pipeline(steps=[('vect', vectorizer), ('svd', svd)])

# Full pipeline: LSA features fed into the classifier
pipe = Pipeline(steps=[('lsa', lsa), ('clf', classifier)])

# Parameter space for the grid search. Nested steps are addressed by chaining
# step names with double underscores.
parameters = {
    'lsa__svd__n_components': (100, 250),  # number of SVD components kept
    'lsa__vect__max_df': (0.9, 1.0),       # max document frequency
}

# 5-fold cross-validated search over the whole dataset, on all CPU cores
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(sentences, y)

# Cell output: best mean cross-validated score
grid_search.best_score_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


0.5999999999999999

**SVD with amazon dataset**

In [27]:
# Load the Amazon reviews (tab-separated; column names supplied explicitly).
# NOTE: the frame is still bound to the name `df_yelp` although it now holds
# the Amazon data.
df_yelp = pd.read_csv(
    'data/amazon_cells_labelled.txt',
    sep='\t',
    names=['sentence', 'label'],
)
df_yelp.head()  # result is discarded: only a cell's last expression is displayed

# Target and features
y = df_yelp['label']
sentences = df_yelp['sentence']

# Fresh, unfitted building blocks:
# bigram tf-idf vectorizer with English stop words removed
vectorizer = TfidfVectorizer(ngram_range=(2, 2), stop_words='english')

# truncated SVD (the LSA step) with default settings
svd = TruncatedSVD()

# logistic regression with the lbfgs solver
classifier = LogisticRegression(solver='lbfgs')

# LSA sub-pipeline (tf-idf -> SVD), then the full pipeline (LSA -> classifier)
lsa = Pipeline(steps=[('vect', vectorizer), ('svd', svd)])
pipe = Pipeline(steps=[('lsa', lsa), ('clf', classifier)])

# Parameter space for the grid search. Nested steps are addressed by chaining
# step names with double underscores.
parameters = {
    'lsa__svd__n_components': (100, 250),  # number of SVD components kept
    'lsa__vect__max_df': (0.9, 1.0),       # max document frequency
}

# 5-fold cross-validated search over the whole dataset, on all CPU cores
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(sentences, y)

# Cell output: best mean cross-validated score
grid_search.best_score_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


0.643

**Pipeline with spaCy word embeddings**

In [30]:
# Load the spaCy model that supplies the word vectors
nlp = spacy.load("en_core_web_lg")

# Yelp reviews, saved locally from the UCI website (tab-separated; column
# names supplied explicitly)
df_yelp = pd.read_csv(
    'data/yelp_labelled.txt',
    sep='\t',
    names=['sentence', 'label'],
)
df_yelp.head()  # result is discarded: only a cell's last expression is displayed

# Target and features
y = df_yelp['label']
sentences = df_yelp['sentence']

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences,
    y,
    random_state=42,
    test_size=0.25,
)

# Function to return one vector per sentence in `docs`
def get_word_vectors(docs):
    """Return the spaCy document vector of every text in ``docs``.

    The texts are streamed through ``nlp.pipe`` so spaCy can process them in
    batches instead of running the pipeline on one text at a time.

    Args:
        docs: iterable of strings (e.g. a pandas Series of sentences).

    Returns:
        list with one ``Doc.vector`` per input text, in input order.
    """
    return [doc.vector for doc in nlp.pipe(docs)]

# Embed both splits: one vector per sentence (mean of all its word vectors)
X_train = get_word_vectors(sentences_train)
X_test = get_word_vectors(sentences_test)

# Logistic regression (lbfgs solver) trained on the sentence vectors;
# `fit` returns the estimator itself, so it is chained onto the constructor
classifier = LogisticRegression(solver='lbfgs').fit(X_train, y_train)

# Mean accuracy on the held-out test split
score = classifier.score(X_test, y_test)
print("Accuracy including word embeddings: ", score)

Accuracy including word embeddings:  0.856


**spaCy word embeddings with the Amazon dataset**

In [31]:
# Load the spaCy model that supplies the word vectors.
# NOTE(review): the Yelp embeddings cell earlier in this notebook loads the
# same model, so this reload is redundant when the notebook runs top to bottom.
nlp = spacy.load("en_core_web_lg")

# Amazon reviews, saved locally from the UCI website (tab-separated; column
# names supplied explicitly). The frame keeps the name `df_yelp`.
df_yelp = pd.read_csv(
    'data/amazon_cells_labelled.txt',
    sep='\t',
    names=['sentence', 'label'],
)
df_yelp.head()  # result is discarded: only a cell's last expression is displayed

# Target and features
y = df_yelp['label']
sentences = df_yelp['sentence']

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences,
    y,
    random_state=42,
    test_size=0.25,
)

# Function to return one vector per sentence in `docs`
def get_word_vectors(docs):
    """Return the spaCy document vector of every text in ``docs``.

    The texts are streamed through ``nlp.pipe`` so spaCy can process them in
    batches instead of running the pipeline on one text at a time.

    Args:
        docs: iterable of strings (e.g. a pandas Series of sentences).

    Returns:
        list with one ``Doc.vector`` per input text, in input order.
    """
    return [doc.vector for doc in nlp.pipe(docs)]

# Embed both splits: one vector per sentence (mean of all its word vectors)
X_train = get_word_vectors(sentences_train)
X_test = get_word_vectors(sentences_test)

# Logistic regression (lbfgs solver) trained on the sentence vectors;
# `fit` returns the estimator itself, so it is chained onto the constructor
classifier = LogisticRegression(solver='lbfgs').fit(X_train, y_train)

# Mean accuracy on the held-out test split
score = classifier.score(X_test, y_test)
print("Accuracy including word embeddings: ", score)

Accuracy including word embeddings:  0.86
