In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

import string

In [2]:
df1 = pd.read_csv("train.csv")
df2 = pd.read_csv("test.csv")


In [3]:
train = df1.filter(["id", "keyword", "text", "target"], axis=1)
test = df2.filter(["id", "keyword", "text"], axis=1)

In [4]:
import re
def find_hashtags(tweet):
    return " ".join([match.group(0)[1:] for match in re.finditer(r"#\w+", tweet)]) or 'no'
def find_mentions(tweet):
    return " ".join([match.group(0)[1:] for match in re.finditer(r"@\w+", tweet)]) or 'no'
def find_links(tweet):
    return " ".join([match.group(0)[:] for match in re.finditer(r"https?://\S+", tweet)]) or 'no'
def tolkencleaner(df):
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    ps = PorterStemmer()
    df['keyword'] = df['keyword'].astype(str)
    tolkenized = []
    for tweet in df.text:
        no_link = re.sub(r'https?://\S+', '', tweet)
        no_break = re.sub(r'\n',' ', no_link)
        cleaner = re.sub('\s+', ' ', no_break).strip()
        word_tokens = word_tokenize(cleaner)
        stripped = [ps.stem(w) for w in word_tokens if not w in punctuation]
        split_sentence = [w for w in stripped if not w in stop_words]
        filtered_sentence = [w.lower() for w in split_sentence]
        finished_sentence = ' '.join(filtered_sentence)
        tolkenized.append(finished_sentence)
    df['tolkenized'] = tolkenized
    df['hashtag'] = df['text'].apply(lambda x: find_hashtags(x))
    df['mentions'] = df['text'].apply(lambda x: find_mentions(x))
    df['links'] = df['text'].apply(lambda x: find_links(x))

In [5]:
tolkencleaner(train)
train.head()

Unnamed: 0,id,keyword,text,target,tolkenized,hashtag,mentions,links
0,1,,Our Deeds are the Reason of this #earthquake M...,1,deed reason thi earthquak may allah forgiv us,earthquake,no,no
1,4,,Forest fire near La Ronge Sask. Canada,1,forest fire near la rong sask canada,no,no,no
2,5,,All residents asked to 'shelter in place' are ...,1,resid ask 'shelter place notifi offic no evacu...,no,no,no
3,6,,"13,000 people receive #wildfires evacuation or...",1,"13,000 peopl receiv wildfir evacu order califo...",wildfires,no,no
4,7,,Just got sent this photo from Ruby #Alaska as ...,1,got sent thi photo rubi alaska smoke wildfir p...,Alaska wildfires,no,no


In [6]:
tolkencleaner(test)
test.head()

Unnamed: 0,id,keyword,text,tolkenized,hashtag,mentions,links
0,0,,Just happened a terrible car crash,happen terribl car crash,no,no,no
1,2,,"Heard about #earthquake is different cities, s...",heard earthquak differ citi stay safe everyon,earthquake,no,no
2,3,,"there is a forest fire at spot pond, geese are...",forest fire spot pond gees flee across street ...,no,no,no
3,9,,Apocalypse lighting. #Spokane #wildfires,apocalyps light spokan wildfir,Spokane wildfires,no,no
4,11,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kill 28 china taiwan,no,no,no


In [7]:

def standardize_text(df, text_field):
    df[text_field] = df[text_field].str.replace(r"http\S+", "yes")
    df[text_field] = df[text_field].str.replace(r"http", "yes")
    df[text_field] = df[text_field].str.replace(r"@\S+", " ")
    df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
    df[text_field] = df[text_field].str.replace(r"@", "at")
    df[text_field] = df[text_field].str.replace("nan", "no")
    
    df[text_field] = df[text_field].str.lower()
    return df

train = standardize_text(train,  "links")
test = standardize_text(test,  "links")



In [8]:
train = standardize_text(train,  "keyword")
test = standardize_text(train,  "keyword")

In [9]:
test.head()

Unnamed: 0,id,keyword,text,target,tolkenized,hashtag,mentions,links
0,1,no,Our Deeds are the Reason of this #earthquake M...,1,deed reason thi earthquak may allah forgiv us,earthquake,no,no
1,4,no,Forest fire near La Ronge Sask. Canada,1,forest fire near la rong sask canada,no,no,no
2,5,no,All residents asked to 'shelter in place' are ...,1,resid ask 'shelter place notifi offic no evacu...,no,no,no
3,6,no,"13,000 people receive #wildfires evacuation or...",1,"13,000 peopl receiv wildfir evacu order califo...",wildfires,no,no
4,7,no,Just got sent this photo from Ruby #Alaska as ...,1,got sent thi photo rubi alaska smoke wildfir p...,Alaska wildfires,no,no


In [9]:
train.to_csv("clean_data.csv")
train.head()

Unnamed: 0,id,keyword,text,target,tolkenized,hashtag,mentions,links
0,1,no,Our Deeds are the Reason of this #earthquake M...,1,deed reason thi earthquak may allah forgiv us,earthquake,no,no
1,4,no,Forest fire near La Ronge Sask. Canada,1,forest fire near la rong sask canada,no,no,no
2,5,no,All residents asked to 'shelter in place' are ...,1,resid ask 'shelter place notifi offic no evacu...,no,no,no
3,6,no,"13,000 people receive #wildfires evacuation or...",1,"13,000 peopl receiv wildfir evacu order califo...",wildfires,no,no
4,7,no,Just got sent this photo from Ruby #Alaska as ...,1,got sent thi photo rubi alaska smoke wildfir p...,Alaska wildfires,no,no


In [11]:
from sklearn.feature_extraction.text import CountVectorizer
cv_hashtag = CountVectorizer(min_df = 5)
cv_mentions = CountVectorizer(min_df = 5)
ht = cv_hashtag.fit_transform(train['hashtag'])
mt = cv_mentions.fit_transform(train['mentions'])
train_ht = pd.DataFrame(ht.toarray(), columns=cv_hashtag.get_feature_names())
train_mt = pd.DataFrame(mt.toarray(), columns=cv_mentions.get_feature_names())
print(train_ht.shape)
print(train_mt.shape)


(7613, 107)
(7613, 18)


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

vec_text = TfidfVectorizer(min_df = 10, ngram_range = (1,2), stop_words='english') 
vt = vec_text.fit_transform(train['tolkenized'])
train_txt = pd.DataFrame(vt.toarray(), columns=vec_text.get_feature_names())
print (train_txt.shape)

(7613, 1657)


In [13]:
train_txt.columns

Index(['00', '01', '04', '05', '06', '07', '08', '08 05', '08 06', '10',
       ...
       'zone', 'û_', 'ûª', 'ûªt', 'ûªv', 'ûï', 'ûïwhen', 'ûïwhen saw', 'ûò',
       'ûó'],
      dtype='object', length=1657)

In [14]:
X = pd.DataFrame(train)
X = X.join(train_ht, rsuffix='_hashtags')
X = X.join(train_mt, rsuffix='_mentions')
X = X.join(train_txt, rsuffix='_text')
X.head()

Unnamed: 0,id,keyword,text,target,tolkenized,hashtag,mentions,links,abstorm,africa,...,zone,û_,ûª,ûªt,ûªv,ûï,ûïwhen,ûïwhen saw,ûò,ûó
0,1,no,Our Deeds are the Reason of this #earthquake M...,1,deed reason thi earthquak may allah forgiv us,earthquake,no,no,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,no,Forest fire near La Ronge Sask. Canada,1,forest fire near la rong sask canada,no,no,no,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,no,All residents asked to 'shelter in place' are ...,1,resid ask 'shelter place notifi offic no evacu...,no,no,no,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6,no,"13,000 people receive #wildfires evacuation or...",1,"13,000 peopl receiv wildfir evacu order califo...",wildfires,no,no,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,no,Just got sent this photo from Ruby #Alaska as ...,1,got sent thi photo rubi alaska smoke wildfir p...,Alaska wildfires,no,no,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
df=X.drop(columns=["id","text","tolkenized"])

In [16]:
df.head()

Unnamed: 0,keyword,target,hashtag,mentions,links,abstorm,africa,afterlife,allah,animalrescue,...,zone,û_,ûª,ûªt,ûªv,ûï,ûïwhen,ûïwhen saw,ûò,ûó
0,no,1,earthquake,no,no,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,no,1,no,no,no,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,no,1,no,no,no,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,no,1,wildfires,no,no,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,no,1,Alaska wildfires,no,no,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
df.to_csv("clean_train.csv", index=False)

In [18]:
y = train.target
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [27]:
df.columns

Index(['keyword', 'target', 'hashtag', 'mentions', 'links', 'abstorm',
       'africa', 'afterlife', 'allah', 'animalrescue',
       ...
       'zone', 'û_', 'ûª', 'ûªt', 'ûªv', 'ûï', 'ûïwhen', 'ûïwhen saw', 'ûò',
       'ûó'],
      dtype='object', length=1787)

In [20]:
import h2o
from h2o.automl import H2OAutoML

h2o.init()


Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,1 hour 32 mins
H2O cluster timezone:,America/Chicago
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.3
H2O cluster version age:,8 days
H2O cluster name:,H2O_from_python_floPe_dk7bea
H2O cluster total nodes:,1
H2O cluster free memory:,1.459 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [28]:
# Identify predictors and response
x = df.columns
y = "target"



In [29]:
x.drop(y)

Index(['keyword', 'hashtag', 'mentions', 'links', 'abstorm', 'africa',
       'afterlife', 'allah', 'animalrescue', 'antioch',
       ...
       'zone', 'û_', 'ûª', 'ûªt', 'ûªv', 'ûï', 'ûïwhen', 'ûïwhen saw', 'ûò',
       'ûó'],
      dtype='object', length=1786)

AttributeError: 'Series' object has no attribute 'asfactor'

In [31]:

# Run AutoML for 30 seconds
aml = H2OAutoML(max_runtime_secs = 30)
aml.train(x = x, y = y,
          training_frame = df,
          leaderboard_frame = test)

# View the AutoML Leaderboard
lb = aml.leaderboard
lb

H2OTypeError: 'training_frame' must be a valid H2OFrame!

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 10)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

ValueError: could not convert string to float: 'epicentre'

In [None]:
import h2o
from h2o.automl import H2OAutoML

In [None]:
h2o.init()

In [None]:
x = train.columns
y = "target"
x.remove(y)
# For binary classification, response should be a factor
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()
# Run AutoML for 60 seconds
aml = H2OAutoML(max_runtime_secs = 60)
aml.train(x = x, y = y, training_frame = train, leaderboard_frame = test)
# View the AutoML Leaderboard
aml.leaderboard
aml.leader
# To generate predictions on a test set, use `"H2OAutoML"` object, or on the leader model object directly as below:
preds = aml.predict(test)