In [1]:
import spacy
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import re
from tqdm import tqdm

In [2]:
data = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [4]:
train, val = train_test_split(data, test_size=0.2)

In [6]:
import string
# From geeksforgeeks.org
# remove punctuation 
def remove_punctuation(text): 
    translator = str.maketrans('', '', string.punctuation) 
    return text.translate(translator) 

In [7]:
#Remove Numbers
def remove_numbers(text): 
    result = re.sub(r'\d+', '', text) 
    return result 
  
input_str = "13000 people receive wildfires evacuation orders in california "
remove_numbers(input_str) 

' people receive wildfires evacuation orders in california '

In [8]:
#Remove English Stopwords
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import nltk
nltk.download("stopwords")

def remove_stopwords(text): 
    stop_words = set(stopwords.words("english"))
    re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
    filtered_text =  re_stop_words.sub(" ", text)
    return filtered_text 
  
example_text = "raining flooding florida tampabay tampa 18 or 19 days ive lost count "
remove_stopwords(example_text) 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tresyap\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'raining flooding florida tampabay tampa 18  19 days ive lost count '

In [9]:
# Stemmer
from nltk.stem.porter import PorterStemmer 
stemmer = PorterStemmer() 
  
# stem words in the list of tokenised words 
def stem_words(text): 
    stems = [stemmer.stem(word) for word in text.split()] 
    return " ".join(stems)
  
text = "raining flooding florida tampabay tampa 18 or 19 days ive lost count "
stem_words(text) 

'rain flood florida tampabay tampa 18 or 19 day ive lost count'

In [10]:
def preprocess(df):
    try:
        df.drop(['keyword', 'location'], axis=1, inplace=True)
    except:
        print("columns already dropped")
    df.text = df.text.apply(remove_punctuation)
    df.text = df.text.apply(remove_numbers)
    df.text = df.text.apply(remove_stopwords)
    df.text = df.text.apply(stem_words)
    return df

In [11]:
train = preprocess(train)
val = preprocess(val)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [12]:
train_texts = train.text.values
train_labels = [{'cats': {'0': label == 0,
                          '1': label == 1}}
                for label in train.target]

In [13]:
len(train_texts)

6090

In [14]:
len(train_labels)

6090

In [15]:
train_data = list(zip(train_texts, train_labels))

In [20]:
def train_spacy(train_data, epochs=10, verbose=0, batch_size=32, archi="ensemble"):
    nlp = spacy.blank("en")

    #default architecture
    textcat = nlp.create_pipe(
                    "textcat",
                    config={
                        "exclusive_classes": True,
                        "architecture": archi
                    })

    nlp.add_pipe(textcat)

    textcat.add_label("0")
    textcat.add_label("1")

    spacy.util.fix_random_seed(1)
    optimizer = nlp.begin_training()

    losses = {}
    for epoch in range(epochs):
        #shuffle data to improve performance

        random.shuffle(train_data)
        #training loop
        for batch in (spacy.util.minibatch(train_data, size=batch_size)):
            texts, labels = zip(*batch)
            nlp.update(texts, labels, sgd=optimizer, losses=losses)
        if verbose == 1:
            print(losses)
    return textcat

In [21]:
def model_eval(epochs=10, verbose=0, batch_size=32, archi="ensemble"):
    nlp = spacy.blank("en")

    textcat = train_spacy(train_data, epochs=epochs, verbose=verbose, batch_size=batch_size, archi=archi)

    test_docs = [nlp.tokenizer(text) for text in val.text]

    scores, _ = textcat.predict(test_docs)

    predicted_labels = scores.argmax(axis=1)
    y_val = val.target

    print(f1_score(y_val, predicted_labels))

In [24]:
# model_eval(10, archi="bow")
model_eval(20, archi="bow")
model_eval(30, archi="bow")

0.7479935794542536
0.7371794871794872


In [25]:
test = preprocess(test)

In [26]:
nlp = spacy.blank("en")
textcat = train_spacy(train_data, 10, archi="bow")
test_docs = [nlp.tokenizer(text) for text in test.text]
scores, _ = textcat.predict(test_docs)
pred = scores.argmax(axis=1)

In [27]:
submission = pd.DataFrame({'id': test.id, 'target': pred})
submission.to_csv('disaster_tweet_spacy.csv', index=False)