In [164]:
# Run this cell
import numpy as np
import matplotlib
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

%matplotlib inline
import re
import nltk
import string
# run this in cmd to avoid an error beforehand for this import
# python -m nltk.downloader stopwords punkt
from nltk.corpus import stopwords

import random
from sklearn.feature_extraction.text import TfidfVectorizer


In [165]:
PUNCT_TO_REMOVE = string.punctuation
STOPWORDS = set(stopwords.words('english'))

def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])


In [166]:
df = pd.read_csv('train.csv')
df["text"] = df["text"].str.lower()
df["text"] = df["text"].apply(lambda text: remove_urls(text))
df["text"] = df["text"].apply(lambda text: remove_punctuation(text))
df["text"] = df["text"].apply(lambda text: remove_stopwords(text))
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,deeds reason earthquake may allah forgive us,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,residents asked shelter place notified officer...,1
3,6,,,13000 people receive wildfires evacuation orde...,1
4,7,,,got sent photo ruby alaska smoke wildfires pou...,1


In [167]:
def randNoun(lines):
    # function to test if something is a noun
    is_noun = lambda pos: pos[:2] == 'NN'
    # do the nlp stuff
    tokenized = nltk.word_tokenize(lines)
    nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)] 
    if (len(nouns) == 0): # if no nouns picked up return rand word in text
        return lines[random.randrange(len(lines))]
    return nouns[random.randrange(len(nouns))]

In [168]:
# preprocessing of the dataset
print(len(df["location"]))
words = {} # will hold unique keywords
count = 0
# getting rid of %20 in keyword and getting a list of all the unqiue keywords
for i in df["keyword"]:
    i = str(i).replace("%20", ' ')
    if (i not in words.keys() and i != "nan"):
        words[i] = count
        count+=1

# filling in missing feature values
# missing keywords get filled in with a keywords already in "words" if that keyword shows up in text, otherwise pick a random word from text
# Missing "location" gets filled in with "Earth"
for i,j in df.iterrows():
    if (pd.isna(j["keyword"])):
        for k in words.keys():
            if k in j["text"]:
                df.at[i, "keyword"] = k
            else:
                df.at[i, "keyword"] = randNoun(j["text"])
    if (pd.isna(j["location"])):
        df.at[i, "location"] = "Earth"
# for i, j in df.iterrows():
#     if (pd.isna(j["location"])):
#         df.drop(i, axis=0, inplace=True)

print(len(df["text"]))


7613
7613


In [169]:
# df.reset_index(inplace=True, drop=True)
# df.head()


In [170]:
# we are going to feature transform 
corpus = df["text"].to_list() # combine all text to a bag of words
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus).toarray() # TF-IDF text encoding

#doing hot and cold encoding for keywords
words = {} # will hold unique keywords with a corresponding index
count = 0
for i in df["keyword"]:
    if (i not in words.keys()):
        words[i] = count
        count+=1
print(words)
keywords_hot_cold = np.zeros((len(df["keyword"]), len(words.keys()))) # init feature shape
for i in range(keywords_hot_cold.shape[0]):
    row = np.zeros(keywords_hot_cold.shape[1])
    row[words[df.at[i, "keyword"]]] = 1 # using the index found in the dict, index the row and set the corresponding value of the keyword to 1
    keywords_hot_cold[i] = row

# keywords_vector =  np.zeros((len(df["keyword"]), 1)) # column vector to store possible values
# for i in range(keywords_vector.shape[0]):
#     keywords_vector[i] = words[df.at[i, "keyword"]] # find corresponding value of that keyword in the "words" dict and assign that value 
# print(keywords_vector)

#doing hot and cold encoding for locations
locations = {}
count = 0
for i in df["location"]:
    if (i not in locations.keys()):
        locations[i] = count
        count+=1
location_hot_cold = np.zeros((len(df["location"]), len(locations.keys())))
for i in range(location_hot_cold.shape[0]):
    row = np.zeros(location_hot_cold.shape[1])
    row[locations[df.at[i, "location"]]] = 1
    location_hot_cold[i] = row

# locations_vector =  np.zeros((len(df["location"]), 1)) # column vector to store possible values
# for i in range(locations_vector.shape[0]):
#     locations_vector[i] = locations[df.at[i, "location"]] # find corresponding value of that keyword in the "words" dict and assign that value 

# scalar = StandardScaler()
# keywords_vector = scalar.fit_transform(keywords_vector)
# locations_vector = scalar.fit_transform(locations_vector)

# print(np.max(locations_vector))




{'reason': 0, 'sask': 1, 'residents': 2, 'evacuation': 3, 'ruby': 4, 'cafire': 5, 'springs': 6, 'fire': 7, 'building': 8, 'im': 9, 'people': 10, 'south': 11, 'days': 12, 'flood': 13, 'bus': 14, 'whats': 15, 'fruits': 16, 'summer': 17, 'car': 18, 'goooooooaaaaaal': 19, 'l': 20, 'london': 21, 'skiing': 22, 'day': 23, 'looooool': 24, 'cant': 25, 'week': 26, 'girlfriend': 27, 'cooool': 28, 'pasta': 29, 'end': 30, 'ablaze': 31, 'accident': 32, 'aftershock': 33, 'airplane%20accident': 34, 'ambulance': 35, 'annihilated': 36, 'annihilation': 37, 'apocalypse': 38, 'armageddon': 39, 'army': 40, 'arson': 41, 'arsonist': 42, 'attack': 43, 'attacked': 44, 'avalanche': 45, 'battle': 46, 'bioterror': 47, 'bioterrorism': 48, 'blaze': 49, 'blazing': 50, 'bleeding': 51, 'blew%20up': 52, 'blight': 53, 'blizzard': 54, 'blood': 55, 'bloody': 56, 'blown%20up': 57, 'body%20bag': 58, 'body%20bagging': 59, 'body%20bags': 60, 'bomb': 61, 'bombed': 62, 'bombing': 63, 'bridge%20collapse': 64, 'buildings%20burning

In [171]:
# print(keywords_hot_cold.shape)
# print(location_hot_cold.shape)


In [172]:
# combine all features and split into training and validation sets

# X = np.hstack((keywords_hot_cold))
# X = keywords_hot_cold
X_train, X_val,  y_train, y_val = train_test_split(X, df["target"], test_size=.4)

# X_train = scalar.fit_transform(X_train)
# X_val = scalar.fit_transform(X_val)

y_train = y_train.to_numpy()
y_val = y_val.to_numpy()

poly = PolynomialFeatures(1)
X_train = poly.fit_transform(X_train)
X_val = poly.fit_transform(X_val)

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

(4567, 17715) (4567,) (3046, 17715) (3046,)


In [173]:
from sklearn import linear_model
lmda = .0001
logreg = linear_model.LogisticRegression(penalty = 'l2', C=(1/lmda), max_iter=1000)
logreg.fit(X_train, y_train)

In [174]:
w_logreg = logreg.coef_
intercept_logreg = logreg.intercept_
y_hat_logreg = logreg.predict(X_val)

# Find the accuracy achieved on test set using logreg.score and y_test 
acc_train = logreg.score(X_train, y_train)
acc_logreg = logreg.score(X_val, y_val)

print("Q19 - Accuracy on training data = %f" % acc_train)

print("Q19 - Accuracy on val data = %f" % acc_logreg)

Q19 - Accuracy on training data = 0.990366
Q19 - Accuracy on val data = 0.774130


In [175]:

from sklearn.metrics import precision_recall_fscore_support
prec, recal, fscore, sup = precision_recall_fscore_support(y_val, y_hat_logreg)
print('Q20 - prec: ', prec)
print('Q20 - recal: ', recal)
print('Q20 - fscore: ', fscore)

Q20 - prec:  [0.77162447 0.77826087]
Q20 - recal:  [0.85157159 0.67394578]
Q20 - fscore:  [0.80962922 0.72235674]
