In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# EDA

In [3]:
#Check for class balance
train.target.value_counts(normalize=True)

0    0.57034
1    0.42966
Name: target, dtype: float64

In [4]:
train.shape

(7613, 5)

In [5]:
train[train.keyword.notnull()].sample(10)

Unnamed: 0,id,keyword,location,text,target
2256,3235,deluged,,Businesses are deluged with inroices.|Make you...,0
1736,2500,collided,"Nairobi, Kenya",Stepkans Media - Two confirmed dead after the ...,1
704,1017,blazing,Konoha,@__srajapakse__ Why thank you there missy ?? t...,0
125,180,aftershock,304,Sometimes you face difficulties not because yo...,0
2326,3342,demolished,"Catalonia, Spain",Demolished My Personal Best http://t.co/ImULL...,0
2060,2954,dead,Afghanistan,17 dead as Afghanistan aircraft crashes: An Af...,1
4341,6165,hijack,,Swansea Û÷plot hijack transfer move for South...,1
7125,10207,violent%20storm,3rd Eye Chakra,Violent video: Ukraine rioters brutally beat p...,1
1991,2864,damage,,It's crazy how a phone can do so much damage t...,0
1721,2483,collided,Pakistan,SSP East says a car AEG 061 driven by a young ...,1


In [6]:
print("Percentage of Targets that have Null Keyword")
print(train[train.target == 0].keyword.isna().sum()/train[train.target == 0].shape[0])
print(train[train.target == 1].keyword.isna().sum()/train[train.target == 1].shape[0])

Percentage of Targets that have Null Keyword
0.004375863657300783
0.012840110058086213


In [7]:
print("Percentage of Targets that have Null Locatoin")
print(train[train.target == 0].location.isna().sum()/train[train.target == 0].shape[0])
print(train[train.target == 1].location.isna().sum()/train[train.target == 1].shape[0])

Percentage of Targets that have Null Locatoin
0.33578995854444954
0.32864567410577805


### Drop Keyword for now, because a keyword is present in text technically
### Drop Location because I want the model to learn based on text content, and not location yet

# Preprocessing
Text preprocessing functions from geeksforgeeks

In [8]:
#Drop keyword and Location
try:
    train = train.drop(['keyword', 'location'], axis=1)
except:
    print("columns already dropped")

In [9]:
#Lowercase 
train.text = train.text.str.lower()

In [10]:
import string
# From geeksforgeeks.org
# remove punctuation 
def remove_punctuation(text): 
    translator = str.maketrans('', '', string.punctuation) 
    return text.translate(translator) 

In [11]:
train.text.iloc[12]

"#raining #flooding #florida #tampabay #tampa 18 or 19 days. i've lost count "

In [12]:
train_clean = train
train_clean.text = train_clean.text.apply(remove_punctuation)

In [13]:
train.text.iloc[12]

'raining flooding florida tampabay tampa 18 or 19 days ive lost count '

In [14]:
train_clean.text.iloc[12]

'raining flooding florida tampabay tampa 18 or 19 days ive lost count '

## Removing Numbers from text, but might inflect instead
Rationale: Don't want to cheat (ex. multiple tweets about an earthquake that lasted 10 days) 

In [15]:
#Remove Numbers
def remove_numbers(text): 
    result = re.sub(r'\d+', '', text) 
    return result 
  
input_str = "13000 people receive wildfires evacuation orders in california "
remove_numbers(input_str) 

' people receive wildfires evacuation orders in california '

In [16]:
#Remove English Stopwords
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

def remove_stopwords(text): 
    stop_words = set(stopwords.words("english"))
    re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
    filtered_text =  re_stop_words.sub(" ", text)
    return filtered_text 
  
example_text = "raining flooding florida tampabay tampa 18 or 19 days ive lost count "
remove_stopwords(example_text) 

'raining flooding florida tampabay tampa 18  19 days ive lost count '

In [17]:
# Stemmer
from nltk.stem.porter import PorterStemmer 
stemmer = PorterStemmer() 
  
# stem words in the list of tokenised words 
def stem_words(text): 
    stems = [stemmer.stem(word) for word in text.split()] 
    return " ".join(stems)
  
text = "raining flooding florida tampabay tampa 18 or 19 days ive lost count "
stem_words(text) 

'rain flood florida tampabay tampa 18 or 19 day ive lost count'

In [18]:
def preprocess(df):
    try:
        df.drop(['keyword', 'location'], axis=1, inplace=True)
    except:
        print("columns already dropped")
    df.text = df.text.apply(remove_punctuation)
    df.text = df.text.apply(remove_numbers)
    df.text = df.text.apply(remove_stopwords)
    df.text = df.text.apply(stem_words)
    return df

In [19]:
train_clean = preprocess(train_clean)

columns already dropped


In [20]:
train_clean

Unnamed: 0,id,text,target
0,1,deed reason earthquak may allah forgiv us all,1
1,4,forest fire near la rong sask canada,1
2,5,resid ask shelter place notifi offic evacu she...,1
3,6,peopl receiv wildfir evacu order california,1
4,7,got sent photo rubi alaska smoke wildfir pour ...,1
5,8,rockyfir updat california hwi close direct due...,1
6,10,flood disast heavi rain caus flash flood stree...,1
7,13,im top hill see fire wood,1
8,14,there emerg evacu happen build across street,1
9,15,im afraid tornado come area,1


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range=(1,2), strip_accents="unicode")
feature_matrix = tfidf.fit_transform(train_clean.text)

In [22]:
train_x = pd.DataFrame(feature_matrix.toarray(), columns=tfidf.get_feature_names())
train_y = train.target

## Note to self: Remove weird letters

In [31]:
#Split train validation
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

In [32]:
from sklearn.svm import LinearSVC

In [33]:
model = LinearSVC()
model.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [34]:
from sklearn.metrics import f1_score

In [35]:
pred_val = model.predict(X_val)

In [38]:
f1_score(y_val, pred_val)

0.7272727272727273

### Use test data

In [25]:
test = pd.read_csv('test.csv')
test = preprocess(test)

In [26]:
test = test.drop(['id'], axis=1)

In [27]:
test_feature_matrix = tfidf.transform(test.text)

In [28]:
test = pd.DataFrame(test_feature_matrix.toarray(), columns=tfidf.get_feature_names())

In [29]:
pred = model.predict(test)