In [1]:
# This code cell is to get rid of annoying tensorflow warnings
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [3]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [4]:
train_df.head(2)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1


In [5]:
X_train_df_raw = train_df.drop(["id", "keyword", "location", "target"], axis=1)
X_test_df_raw = test_df.drop(["id", "keyword", "location"], axis=1)

In [6]:
X_train_df_raw.head()

Unnamed: 0,text
0,Our Deeds are the Reason of this #earthquake M...
1,Forest fire near La Ronge Sask. Canada
2,All residents asked to 'shelter in place' are ...
3,"13,000 people receive #wildfires evacuation or..."
4,Just got sent this photo from Ruby #Alaska as ...


In [7]:
X_test_df_raw.head()

Unnamed: 0,text
0,Just happened a terrible car crash
1,"Heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,Apocalypse lighting. #Spokane #wildfires
4,Typhoon Soudelor kills 28 in China and Taiwan


In [8]:
y = train_df.target.to_numpy()

In [9]:
from nltk.corpus import stopwords
STOPWORDS = stopwords.words("english")
STOPWORDS[:3]

['i', 'me', 'my']

In [10]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
def text_process(sentence):
    sentence = "".join([char for char in sentence if char not in string.punctuation])
    clean_words = [w for w in sentence.split() if w not in STOPWORDS]
    return " ".join(clean_words)

In [12]:
X_train_df = X_train_df_raw.text.apply(text_process)

In [13]:
X_test_df = X_test_df_raw.text.apply(text_process)

In [14]:
X_train_df

0        Our Deeds Reason earthquake May ALLAH Forgive us
1                   Forest fire near La Ronge Sask Canada
2       All residents asked shelter place notified off...
3       13000 people receive wildfires evacuation orde...
4       Just got sent photo Ruby Alaska smoke wildfire...
                              ...                        
7608    Two giant cranes holding bridge collapse nearb...
7609    ariaahrary TheTawniest The control wild fires ...
7610    M194 0104 UTC5km S Volcano Hawaii httptcozDtoy...
7611    Police investigating ebike collided car Little...
7612    The Latest More Homes Razed Northern Californi...
Name: text, Length: 7613, dtype: object

In [15]:
X_test_df

0                        Just happened terrible car crash
1       Heard earthquake different cities stay safe ev...
2       forest fire spot pond geese fleeing across str...
3                   Apocalypse lighting Spokane wildfires
4                  Typhoon Soudelor kills 28 China Taiwan
                              ...                        
3258    EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259    Storm RI worse last hurricane My cityamp3other...
3260      Green Line derailment Chicago httptcoUtbXLcBIuY
3261    MEG issues Hazardous Weather Outlook HWO httpt...
3262    CityofCalgary activated Municipal Emergency Pl...
Name: text, Length: 3263, dtype: object

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()

In [17]:
X = cv.fit_transform(X_train_df)
X = X.toarray()

In [18]:
X_test = cv.transform(X_test_df)

In [19]:
cv.get_feature_names_out()

array(['0011', '001116', '0025', ..., 'ûónegligence', 'ûótech', 'ûówe'],
      dtype=object)

In [20]:
X.shape

(7613, 22380)

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)

(6090, 22380) (6090,)
(1523, 22380) (1523,)


In [23]:
X_test.shape

(3263, 22380)

In [24]:
model = tf.keras.models.Sequential([
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(2, activation="softmax")
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


In [25]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f56b808a340>

In [26]:
model.predict(X_test)



array([[0.25543275, 0.7445674 ],
       [0.1585976 , 0.8414025 ],
       [0.04461766, 0.9553824 ],
       ...,
       [0.01604349, 0.9839565 ],
       [0.63692534, 0.36307466],
       [0.18682645, 0.81317353]], dtype=float32)

In [27]:
ans = np.argmax(model.predict(X_test), 1)



In [28]:
ans_df = pd.DataFrame({"id": test_df.id, "target": ans})

In [29]:
ans_df.to_csv("outputs/ans3.csv", index=False)