In [17]:
import pandas as pd

data = pd.read_csv('sentence_classification_data.csv', header=0)
data = data.dropna()
X = data[['Sentence', 'Gram']]
y = data['Type']

Separate the train and test datasets

In [18]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state = 1)


Load Google's Universal Sentence Encoder v4. The TFHUB_CACHE_DIR enviromental variable saves the module to a location in the CWD, so you don't have to download a 1GB file every time. This will take a bit the first time you run this cell.

In [19]:
import os
import tensorflow_hub as hub
os.environ['TFHUB_CACHE_DIR'] = './tf_cache'

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

This is literally just a copy and paste of the clean_data file. We'll use full_clean() later.

In [20]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import download
from nltk import word_tokenize
from bs4 import BeautifulSoup
import re
import string
import unicodedata
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
    nltk.download("wordnet")


'''All of these are functions to normalize the data'''

def remove_between_square_brackets(text):
    '''uses regex to remove all of the square brackets since the result is a list'''
    return re.sub('\[[^]]*\]', '', text)


def tokenize(text):
    words = nltk.word_tokenize(text) # puts all of the words into a list
    return (words)



def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    new_words = []
    dirty_characters = "!@#$%^&*()[]{};:,./<>?\|`~-=_+"
    for word in words:
        if (word not in dirty_characters) and (word != "''" and word != "``") and ("'" not in word):
            new_words.append(word)
    return new_words


def remove_stopwords(words):
    stop = set(stopwords.words('english'))
    with open("common.txt") as file:
        common_words = set(file.read().split('\n'))
    stop = stop|common_words
    new_words = []
    for word in words:
        if not word in stop:
            new_words.append(word)
    return new_words

def lemmatize(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    new_words = []
    for word in words:
        new_words.append(lemmatizer.lemmatize(word)) # lemmatizes the words in the list and adds them to the new list
    return new_words



def normalize(words):
    '''Applies all previous functions to isolate nonuseful data'''
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = remove_stopwords(words)
    words = lemmatize(words)
    return words

def full_clean(corpus):
    clean_corpus = []
    for c in corpus:
        words = remove_between_square_brackets(c)
        words = tokenize(words)
        words = normalize(words)
        clean_corpus.append(' '.join(words))
    return clean_corpus


Normalizeing the data

In [21]:
# Noramlization time!
sentences = X_train['Sentence']
grams = X_train['Gram']

cleaned_sentences = full_clean(sentences)

Creates an array of features, with each feature in the layout: [phrase, sentence]. Feature engineering via part of speech tagging.

In [40]:
# time to build features :)
from nltk import pos_tag

features = []

for i in range(len(sentences)):
    phrase = grams.iloc[i]
    
    phrase_parts_of_speech = pos_tag(phrase.split(" "))
    phrase_only_pos = []
    for p in phrase_parts_of_speech:
        phrase_only_pos.append(p[1])
    
    sentence = cleaned_sentences[i]
    sentence_parts_of_speech = pos_tag(sentence.split(" "))
    sentence_only_pos = []
    for p in sentence_parts_of_speech:
        sentence_only_pos.append(p[1])
    
    phrase_only_pos = ' '.join(phrase_only_pos)
    sentence_only_pos = ' '.join(sentence_only_pos)
    
    features.append([phrase, sentence, phrase_only_pos, sentence_only_pos])


['united', 'sanction venezuela venezuela country introduced sanction european union-countries collectively introduced sanction non-eu european country aligned eu sanction country introducing entry ban maduro government official country introduced sanction venezuela response outgoing crisis venezuela crisis venezuela government united state european union canada mexico panama switzerland applied individual sanction associated administration nicolas maduro', 'JJ', 'NN FW JJ NN VBD JJ JJ NNS RB VBN NN JJ JJ NN VBD JJ NN NN VBG NN NN JJ NN NN NN VBD NN NN NN VBG NN NN NN JJ NN VBD NN JJ NN NN VBP NN NN VBD JJ NN VBN NN NNS VBP']


Encode the strings in each feature w/ USE, and encode the labels in our y.

In [41]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

label_enc = LabelEncoder()

embed_features = []

for f in features:
    gram = f[0]
    sentence = f[1]
    phrase_pos = f[2]
    sentence_pos = f[3]
    
    embeddings = embed([gram, sentence, phrase_pos, sentence_pos])
    new = np.array([embeddings[0], embeddings[1], embeddings[2], embeddings[3]])
    embed_features.append(new.flatten()) # We have to flatten because sklearn only supports 1D features :(

y_train_encoded = label_enc.fit_transform(y_train)
print(embed_features)
print(y_train_encoded)

[array([ 0.01449664,  0.02201575,  0.02600318, ...,  0.05167248,
        0.0125527 , -0.06309862], dtype=float32), array([-0.04417213, -0.07319518,  0.0011199 , ...,  0.05493757,
        0.00906134, -0.05537884], dtype=float32), array([-0.05097289, -0.05394271, -0.03895493, ..., -0.00904782,
       -0.05985049, -0.06387902], dtype=float32), array([ 0.06932335, -0.06855359, -0.01106318, ...,  0.05095861,
        0.03881134, -0.06195608], dtype=float32), array([-0.04513631, -0.07369344,  0.05026276, ...,  0.05375683,
       -0.0074142 , -0.06578529], dtype=float32), array([-0.00479155, -0.01222475, -0.03512248, ...,  0.05358936,
        0.04790596, -0.04188489], dtype=float32), array([ 0.03843361, -0.0717413 ,  0.05358797, ...,  0.05802584,
        0.05536174, -0.069147  ], dtype=float32), array([ 0.03607225,  0.06999975, -0.07253078, ...,  0.03699044,
        0.04228488, -0.06543402], dtype=float32), array([ 0.03607225,  0.06999975, -0.07253078, ...,  0.05665174,
        0.00404097, -0.

Train the classifier.

In [42]:
# Ayyy it's classifier time
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=1)

clf.fit(embed_features, y_train_encoded)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=1,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

Preprocess the validation data the same as we did the training data, and then get percent accuracy.

In [34]:
# Finna validate this bitch
from sklearn.metrics import accuracy_score

test_features = []
test_sentences = full_clean(X_test['Sentence'])

for i in range(len(X_test['Sentence'])):
    phrase = X_test['Gram'].iloc[i]
    sentence = test_sentences[i]
    
    
    phrase_parts_of_speech = pos_tag(phrase.split(" "))
    phrase_only_pos = []
    for p in phrase_parts_of_speech:
        phrase_only_pos.append(p[1])
        
    
    sentence_parts_of_speech = pos_tag(sentence.split(" "))
    sentence_only_pos = []
    for p in sentence_parts_of_speech:
        sentence_only_pos.append(p[1])
    
    phrase_only_pos = ' '.join(phrase_only_pos)
    sentence_only_pos = ' '.join(sentence_only_pos)
    
    test_features.append([phrase, sentence, phrase_only_pos, sentence_only_pos])

embed_test_features = []


for f in test_features:
    gram = f[0]
    sentence = f[1]
    phrase_pos = f[2]
    sentence_pos = f[3]
    embeddings = embed([gram, sentence, phrase_pos, sentence_pos])
    new = np.array([embeddings[0], embeddings[1], embeddings[2], embeddings[3]])
    embed_test_features.append(new.flatten())
    
y_test_encoded = label_enc.transform(y_test)

preds = clf.predict(embed_test_features)

acc = accuracy_score(y_test_encoded, preds)

print("ACCURACY:")
print(acc)

ACCURACY:
0.8125
