In [1]:
import pandas as pd

data = pd.read_csv('sentence_classification_data.csv', header=0)
data = data.dropna()
X = data[['Sentence', 'Gram']]
y = data['Type']

Separate the train and test datasets

In [2]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state = 1)


Load Google's Universal Sentence Encoder v4. The TFHUB_CACHE_DIR enviromental variable saves the module to a location in the CWD, so you don't have to download a 1GB file every time. This will take a bit the first time you run this cell.

In [3]:
import os
import tensorflow_hub as hub
os.environ['TFHUB_CACHE_DIR'] = './tf_cache'

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

This is literally just a copy and paste of the clean_data file. We'll use full_clean() later.

In [4]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import download
from nltk import word_tokenize
from bs4 import BeautifulSoup
import re
import string
import unicodedata
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
    nltk.download("wordnet")


'''All of these are functions to normalize the data'''

def remove_between_square_brackets(text):
    '''uses regex to remove all of the square brackets since the result is a list'''
    return re.sub('\[[^]]*\]', '', text)


def tokenize(text):
    words = nltk.word_tokenize(text) # puts all of the words into a list
    return (words)



def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    new_words = []
    dirty_characters = "!@#$%^&*()[]{};:,./<>?\|`~-=_+"
    for word in words:
        if (word not in dirty_characters) and (word != "''" and word != "``") and ("'" not in word):
            new_words.append(word)
    return new_words


def remove_stopwords(words):
    stop = set(stopwords.words('english'))
    with open("common.txt") as file:
        common_words = set(file.read().split('\n'))
    stop = stop|common_words
    new_words = []
    for word in words:
        if not word in stop:
            new_words.append(word)
    return new_words

def lemmatize(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    new_words = []
    for word in words:
        new_words.append(lemmatizer.lemmatize(word)) # lemmatizes the words in the list and adds them to the new list
    return new_words



def normalize(words):
    '''Applies all previous functions to isolate nonuseful data'''
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = remove_stopwords(words)
    words = lemmatize(words)
    return words

def full_clean(corpus):
    clean_corpus = []
    for c in corpus:
        words = remove_between_square_brackets(c)
        words = tokenize(words)
        words = normalize(words)
        clean_corpus.append(' '.join(words))
    return clean_corpus


Normalizeing the data

In [5]:
# Noramlization time!
sentences = X_train['Sentence']
grams = X_train['Gram']

cleaned_sentences = full_clean(sentences)

Creates an array of features, with each feature in the layout: [phrase, sentence]

In [6]:
# time to build features :)
features = []

for i in range(len(sentences)):
    features.append([grams.iloc[i], cleaned_sentences[i]])

Encode the strings in each feature w/ USE, and encode the labels in our y.

In [14]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

label_enc = LabelEncoder()

embed_features = []

for f in features:
    gram = f[0]
    sentence = f[1]
    embeddings = embed([gram, sentence])
    new = np.array([embeddings[0], embeddings[1]])
#    new = np.array([embed([gram])[0], embed([sentence])[0]])

    embed_features.append(new.flatten()) # We have to flatten because sklearn only supports 1D features :(

y_train_encoded = label_enc.fit_transform(y_train)
print(embed_features)
print(y_train_encoded)

[array([ 0.01449663,  0.02201574,  0.02600321, ..., -0.03402658,
       -0.06647635, -0.0704798 ], dtype=float32), array([-0.04417212, -0.0731952 ,  0.00111988, ...,  0.03333268,
       -0.05691196, -0.05793458], dtype=float32), array([-0.05097286, -0.05394271, -0.03895494, ...,  0.06551553,
       -0.06181725,  0.04572295], dtype=float32), array([ 0.06932335, -0.0685536 , -0.01106319, ...,  0.07058391,
       -0.04423688, -0.00763032], dtype=float32), array([-0.04513633, -0.07369342,  0.05026278, ...,  0.02537297,
       -0.00067364, -0.01929236], dtype=float32), array([-0.0047916 , -0.01222474, -0.03512248, ...,  0.05273103,
       -0.06542797, -0.07201196], dtype=float32), array([ 0.03843356, -0.0717413 ,  0.05358797, ...,  0.00788131,
       -0.03395406, -0.00507998], dtype=float32), array([ 0.03607225,  0.06999976, -0.07253077, ...,  0.04531831,
       -0.03115256,  0.02231706], dtype=float32), array([ 0.03607225,  0.06999976, -0.07253077, ...,  0.06139121,
        0.01927412, -0.

Train the classifier.

In [15]:
# Ayyy it's classifier time
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=1)

clf.fit(embed_features, y_train_encoded)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=1,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

Preprocess the validation data the same as we did the training data, and then get percent accuracy.

In [16]:
# Finna validate this bitch
from sklearn.metrics import accuracy_score

test_features = []
test_sentences = full_clean(X_test['Sentence'])

for i in range(len(X_test['Sentence'])):
    test_features.append([X_test['Gram'].iloc[i], test_sentences[i]])

embed_test_features = []

for f in test_features:
    gram = f[0]
    sentence = f[1]
    embeddings = embed([gram, sentence])
    new = np.array([embeddings[0], embeddings[1]])
    embed_test_features.append(new.flatten())
    
y_test_encoded = label_enc.transform(y_test)

preds = clf.predict(embed_test_features)

acc = accuracy_score(y_test_encoded, preds)

print("ACCURACY:")
print(acc)

ACCURACY:
0.8125
