In [1]:
# Quieten TensorFlow's native (C++) logging: level '2' filters out INFO and
# WARNING messages. The variable is set here, before the cell that imports
# tensorflow, so that it is already in the environment when TensorFlow loads.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
# from sklearn.naive_bayes import MultinomialNB
import tensorflow as tf

In [3]:
# Read the training and test tables; the relative paths are resolved against
# the notebook's current working directory.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [4]:
# Preview the first rows to check which columns were loaded.
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# Labels for training: the `target` column of the training table.
# NOTE(review): the classifier below has 2 output units, so this presumably
# holds only 0/1 values — confirm.
y_train = df_train["target"]

In [6]:
# Extract the raw `text` column of each table as a NumPy array of strings.
data_train = df_train["text"].to_numpy()
data_test = df_test["text"].to_numpy()

In [7]:
# Display the raw training text before any cleaning.
data_train

array(['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
       'Forest fire near La Ronge Sask. Canada',
       "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
       ...,
       'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ',
       'Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.',
       'The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d'],
      dtype=object)

In [8]:
# Shared preprocessing helpers read by cleanSentence().
# The pattern is a raw string: in a normal literal "\w" is an invalid escape
# sequence, which Python reports as a DeprecationWarning (SyntaxWarning from
# 3.12) and may reject in a future release.
tokenizer = RegexpTokenizer(r"\w+")  # tokens are runs of word characters
sw = set(stopwords.words("english"))  # set gives constant-time membership tests
ps = PorterStemmer()

In [9]:
def cleanSentence(sent):
    """Reduce one piece of text to a space-separated string of word stems.

    The text is lower-cased and split with the module-level ``tokenizer``;
    tokens found in the stop-word set ``sw`` are dropped and every remaining
    token is stemmed with ``ps``.

    Args:
        sent: The string to clean.

    Returns:
        The surviving stems, in their original order, joined by single spaces.
    """
    tokens = tokenizer.tokenize(sent.lower())
    kept_stems = [ps.stem(tok) for tok in tokens if tok not in sw]
    return " ".join(kept_stems)

In [10]:
def getDoc(document):
    """Run cleanSentence over every entry of an iterable of strings.

    Args:
        document: Iterable of raw text strings.

    Returns:
        A NumPy array containing the cleaned strings in input order.
    """
    return np.array([cleanSentence(text) for text in document])

In [11]:
# Apply the same cleaning (lower-case, tokenise, drop stop words, stem) to
# both the training and the test text.
cleaned_data_train = getDoc(data_train)
cleaned_data_test = getDoc(data_test)

In [12]:
# Raw text again, for side-by-side comparison with the cleaned version; the
# cleaning step builds new arrays and leaves this one unchanged.
data_train

array(['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
       'Forest fire near La Ronge Sask. Canada',
       "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
       ...,
       'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ',
       'Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.',
       'The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d'],
      dtype=object)

In [13]:
# Cleaned training text: lower-cased, stop words removed, tokens stemmed.
cleaned_data_train

array(['deed reason earthquak may allah forgiv us',
       'forest fire near la rong sask canada',
       'resid ask shelter place notifi offic evacu shelter place order expect',
       ..., 'm1 94 01 04 utc 5km volcano hawaii http co zdtoyd8ebj',
       'polic investig e bike collid car littl portug e bike rider suffer seriou non life threaten injuri',
       'latest home raze northern california wildfir abc news http co ymy4rskq3d'],
      dtype='<U129')

In [14]:
# Bag-of-words vectoriser with scikit-learn's default settings.
cv = CountVectorizer()

In [15]:
# Learn the vocabulary from the training text only, then reuse it for the test
# text so both matrices share the same columns.
# The counts are cast to float32 while still sparse: CountVectorizer produces
# int64 by default, so densifying directly costs 8 bytes per cell. float32
# halves the memory of the dense arrays and represents these small integer
# counts exactly, so no value changes.
X_train = cv.fit_transform(cleaned_data_train).astype(np.float32).toarray()
X_test = cv.transform(cleaned_data_test).astype(np.float32).toarray()

In [16]:
# model = MultinomialNB()

# Feed-forward classifier built layer by layer: a 128-unit ReLU hidden layer,
# dropout at rate 0.2, and a 2-unit softmax output (one probability per class).
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(2, activation='softmax'))

In [17]:
# Adam optimiser with sparse categorical cross-entropy, which takes integer
# class labels (not one-hot vectors) against the softmax output; accuracy is
# tracked as the reported metric.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [18]:
# Train for 5 epochs on the full training matrix. No validation data is passed,
# so the accuracy reported during fitting is measured on the training set only.
model.fit(X_train, y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f83e403e0a0>

In [19]:
# Sanity check: the feature matrix and the label vector must have the same
# number of rows.
X_train.shape, y_train.shape

((7613, 18526), (7613,))

In [27]:
# Predicted class per test row: the column index with the highest predicted
# probability.
ans = model.predict(X_test).argmax(axis=1)



In [29]:
# Submission table: the test ids alongside the predicted class for each row.
df_ans = df_test[["id"]].assign(target=ans)

In [30]:
# Create the destination folder first: to_csv does not create missing
# directories and fails if "outputs" is absent (e.g. on a fresh checkout).
os.makedirs("outputs", exist_ok=True)
# index=False writes only the id/target columns, without the row index.
df_ans.to_csv("outputs/ans7.csv", index=False)