In [10]:
# Run this cell
import numpy as np
import matplotlib
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

%matplotlib inline
import re
import nltk
import string
# run this in cmd to avoid an error beforehand for this import
# python -m nltk.downloader stopwords punkt
from nltk.corpus import stopwords

import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [11]:
PUNCT_TO_REMOVE = string.punctuation
STOPWORDS = set(stopwords.words('english'))

def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])


In [12]:
df = pd.read_csv('train.csv')
df["text"] = df["text"].str.lower()
df["text"] = df["text"].apply(lambda text: remove_urls(text))
df["text"] = df["text"].apply(lambda text: remove_punctuation(text))
df["text"] = df["text"].apply(lambda text: remove_stopwords(text))
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,deeds reason earthquake may allah forgive us,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,residents asked shelter place notified officer...,1
3,6,,,13000 people receive wildfires evacuation orde...,1
4,7,,,got sent photo ruby alaska smoke wildfires pou...,1


In [13]:
def randNoun(lines):
    # function to test if something is a noun
    is_noun = lambda pos: pos[:2] == 'NN'
    # do the nlp stuff
    tokenized = nltk.word_tokenize(lines)
    nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)] 
    if (len(nouns) == 0): # if no nouns picked up return rand word in text
        return lines[random.randrange(len(lines))]
    return nouns[random.randrange(len(nouns))]

In [14]:
# preprocessing of the dataset
print(len(df["location"]))
words = {} # will hold unique keywords
count = 0
# getting rid of %20 in keyword and getting a list of all the unqiue keywords
for i in df["keyword"]:
    i = str(i).replace("%20", ' ')
    if (i not in words.keys() and i != "nan"):
        words[i] = count
        count+=1

# filling in missing feature values
# missing keywords get filled in with a keywords already in "words" if that keyword shows up in text, otherwise pick a random word from text
# Missing "location" gets filled in with "Earth"
for i,j in df.iterrows():
    if (pd.isna(j["keyword"])):
        for k in words.keys():
            if k in j["text"]:
                df.at[i, "keyword"] = k
            else:
                df.at[i, "keyword"] = randNoun(j["text"])
    if (pd.isna(j["location"])):
        df.at[i, "location"] = "Earth"
# for i, j in df.iterrows():
#     if (pd.isna(j["location"])):
#         df.drop(i, axis=0, inplace=True)

print(len(df["text"]))


7613
7613


In [15]:
# we are going to feature transform 
corpus = df["text"].to_list() # combine all text to a bag of words
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus).toarray() # TF-IDF text encoding

#doing hot and cold encoding for keywords
words = {} # will hold unique keywords with a corresponding index
count = 0
for i in df["keyword"]:
    if (i not in words.keys()):
        words[i] = count
        count+=1
# print(words)
# keywords_hot_cold = np.zeros((len(df["keyword"]), len(words.keys()))) # init feature shape
# for i in range(keywords_hot_cold.shape[0]):
#     row = np.zeros(keywords_hot_cold.shape[1])
#     row[words[df.at[i, "keyword"]]] = 1 # using the index found in the dict, index the row and set the corresponding value of the keyword to 1
#     keywords_hot_cold[i] = row

keywords_vector =  np.zeros((len(df["keyword"]), 1)) # column vector to store possible values
for i in range(keywords_vector.shape[0]):
    keywords_vector[i] = words[df.at[i, "keyword"]] # find corresponding value of that keyword in the "words" dict and assign that value 
print(keywords_vector)

#doing hot and cold encoding for locations
locations = {}
count = 0
for i in df["location"]:
    if (i not in locations.keys()):
        locations[i] = count
        count+=1
# location_hot_cold = np.zeros((len(df["location"]), len(locations.keys())))
# for i in range(location_hot_cold.shape[0]):
#     row = np.zeros(location_hot_cold.shape[1])
#     row[locations[df.at[i, "location"]]] = 1
#     location_hot_cold[i] = row

locations_vector =  np.zeros((len(df["location"]), 1)) # column vector to store possible values
for i in range(locations_vector.shape[0]):
    locations_vector[i] = locations[df.at[i, "location"]] # find corresponding value of that keyword in the "words" dict and assign that value 

scalar = StandardScaler()
keywords_vector = scalar.fit_transform(keywords_vector)
locations_vector = scalar.fit_transform(locations_vector)

print(np.max(locations_vector))




[[  0.]
 [  1.]
 [  2.]
 ...
 [269.]
 [ 14.]
 [266.]]
2.390050825669785


In [16]:
# me trying to do data analytics. this sucks we have too many features :P
# curr_max = 0
# ind = -1
# for i in range(X.shape[1]):
#     total = np.sum(X[:,i])
#     if (52.55877985596134 > total > curr_max):
#         curr_max = total
#         ind = i
# print(curr_max)
# print(ind)
# 7958 9347 6168 6753
# for i in range(0, len(df["target"])):
#     if (X[i, 7958] != 0 and X[i, 9347] != 0):
#         if (df.at[i, "target"] == 0):
#             plt.scatter(X[i, 7958], X[i, 9347], c='r')
#         else:
#             plt.scatter(X[i, 7958], X[i, 9347], c='b')
#     if (X[i, 6168] != 0 and X[i, 6753] != 0):
#         if (df.at[i, "target"] == 0):
#             plt.scatter(X[i, 6168], X[i, 6753], c='green')
#         else:
#             plt.scatter(X[i, 6168], X[i, 6753], c='yellow')

    
    


# print(np.max(X[:,7958]))

# plt.show()

In [17]:
# combine all features and split into training and validation sets

X = np.hstack((keywords_vector, locations_vector, X))
y =  df["target"]


X_train, X_val,  y_train, y_val = train_test_split(X, df["target"], test_size=.4)
y_train = y_train.to_numpy()
y_val = y_val.to_numpy()



clf = SVC(gamma=.1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
acc = clf.predict(y_val, y_pred)
print(acc)
# X_train = scalar.fit_transform(X_train)
# X_val = scalar.fit_transform(X_val)



print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)