In [6]:
# remove words of size 2

In [1]:
import pandas as pd
import numpy as np



In [2]:
#
import nltk
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

import re

In [7]:
df = pd.read_csv(r'preprocessed_clean_text.csv')

In [10]:
df.drop(['id','keyword','location'],axis=1, inplace=True)

KeyError: "['id'] not found in axis"

In [11]:
df.head()

Unnamed: 0,target,text
0,1,deed reason allah forgive
1,1,forest fire near la range sask canada
2,1,resident ask shelter place notify officer evac...
3,1,people receive evacuation order california
4,1,get send photo ruby smoke pour school


In [None]:
def preprocessor(dataframe):
    pass

In [9]:
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

# Tweet Tokenizer


In [10]:
tweeter = TweetTokenizer(strip_handles=True,reduce_len=True, preserve_case=False)

In [11]:
df['tweets'] = df['text'].apply(lambda text: tweeter.tokenize(text))

In [12]:
df['tweets']

0       [our, deeds, are, the, reason, of, this, #eart...
1        [forest, fire, near, la, ronge, sask, ., canada]
2       [all, residents, asked, to, ', shelter, in, pl...
3       [13,000, people, receive, #wildfires, evacuati...
4       [just, got, sent, this, photo, from, ruby, #al...
                              ...                        
7608    [two, giant, cranes, holding, a, bridge, colla...
7609    [the, out, of, control, wild, fires, in, calif...
7610    [m1, ., 94, [, 01:04, utc, ], ?, 5km, s, of, v...
7611    [police, investigating, after, an, e-bike, col...
7612    [the, latest, :, more, homes, razed, by, north...
Name: tweets, Length: 7613, dtype: object

In [13]:
lemma = WordNetLemmatizer()

In [14]:
def lemmatizing(sentence_array):
    lemmas = nltk.pos_tag(sentence_array)
    container = []
    for word,lem in lemmas:
        pos = get_wordnet_pos(lem)
        if pos:
            container.append(lemma.lemmatize(word,pos=pos))
        else:
            container.append(lemma.lemmatize(word))
    return container

In [15]:
df['lemmed'] = df['tweets'].apply(lambda array: lemmatizing(array))

In [16]:
df['lemmed']

0       [our, deed, be, the, reason, of, this, #earthq...
1        [forest, fire, near, la, ronge, sask, ., canada]
2       [all, resident, ask, to, ', shelter, in, place...
3       [13,000, people, receive, #wildfires, evacuati...
4       [just, get, send, this, photo, from, ruby, #al...
                              ...                        
7608    [two, giant, crane, hold, a, bridge, collapse,...
7609    [the, out, of, control, wild, fire, in, califo...
7610    [m1, ., 94, [, 01:04, utc, ], ?, 5km, s, of, v...
7611    [police, investigate, after, an, e-bike, colli...
7612    [the, late, :, more, home, raze, by, northern,...
Name: lemmed, Length: 7613, dtype: object

In [17]:
def cleaner(sentence_array):
    sentence = " ".join(sentence_array)
    sentence = re.sub(r"http\S+", "", sentence)
    sentence = re.sub('[()!?]', ' ', sentence)
    sentence = re.sub('[()!?]', ' ', sentence)
    sentence = re.sub('\[.*?\]',' ', sentence)
    sentence = re.sub("[^a-z0-9]"," ", sentence)
    sentence = re.sub(r'\b\w{1,3}\b', '', sentence)
    sentence = [w for w in sentence.split() if not w in stopwords.words('english')]

    return " ".join(sentence)
        

In [18]:
df['cleaned'] = df['lemmed'].apply(lambda array: cleaner(array))

In [19]:
df['cleaned']

0                    deed reason earthquake allah forgive
1                      forest fire near ronge sask canada
2       resident shelter place notify officer evacuati...
3       people receive wildfires evacuation order cali...
4       send photo ruby alaska smoke wildfires pour sc...
                              ...                        
7608         giant crane hold bridge collapse nearby home
7609    control wild fire california even northern par...
7610                                       volcano hawaii
7611    police investigate bike collided little portug...
7612     late home raze northern california wildfire news
Name: cleaned, Length: 7613, dtype: object

In [20]:
df['cleaned'].isna().sum()

0

# Machine learning

In [26]:
from sklearn.model_selection import train_test_split

#reminder to build preprocessor function and insert in the vectorizers instead of using the default
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


from keras import models, layers, optimizers

In [27]:
X_train, X_test, y_train, y_test = train_test_split(df['cleaned'],df['target'],test_size = .2, random_state = 42)

## CountVectorizer

In [28]:
counter = CountVectorizer()

In [29]:
counted_vect = counter.fit_transform(X_train)

In [30]:
counted_vect.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [31]:
counted_df = pd.DataFrame(data=counted_vect.todense(),columns=counter.get_feature_names())

In [32]:
counted_df

Unnamed: 0,aaaaaaallll,aaarrrgghhh,ab,aba,abandon,abbott,abbswinston,abe,aberdeen,ability,...,zionist,zionists,zip,zipped,zipper,zombie,zone,zoom,zurich,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6075,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6076,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6077,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6078,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
X_train_c = counter.transform(X_train)
X_test_c = counter.transform(X_test)

In [34]:
log = LogisticRegression()
log.fit(X_train_c,y_train)

LogisticRegression()

In [35]:
log.score(X_test_c,y_test)

0.7928994082840237

## TFidf Vectorizer

In [36]:
tfidf = TfidfVectorizer()
tf_vect = tfidf.fit_transform(X_train)
X_train_tf = tfidf.transform(X_train)
X_test_tf = tfidf.transform(X_test)

In [37]:
logtf = LogisticRegression()
logtf.fit(X_train_tf,y_train)

LogisticRegression()

In [38]:
logtf.score(X_test_tf,y_test)

0.8034188034188035

### TFIDF part 2

In [None]:
tfidf2 = TfidfVectorizer()
tf_vect2 = tfidf2.fit_transform(X_train)
X_train_tf2 = tfidf2.transform(X_train)
X_test_tf2 = tfidf2.transform(X_test)

# Neural Networks

In [21]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    df = dataframe.copy()
    labels = df.pop('target')
    df = df['text']
    ds = tf.data.Dataset.from_tensor_slices((df, labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

NameError: name 'data' is not defined

In [107]:
temp

['m1',
 '.',
 '94',
 '[',
 '01:04',
 'utc',
 ']',
 '?',
 '5km',
 's',
 'of',
 'volcano',
 'hawaii',
 '.',
 'http://t.co/zDtoyd8EbJ']

In [108]:
cleaner(temp)

['volcano', 'hawaii']

In [76]:
temp = df['lemmed'].iloc[7610]

In [77]:
temp

['m1',
 '.',
 '94',
 '[',
 '01:04',
 'utc',
 ']',
 '?',
 '5km',
 's',
 'of',
 'volcano',
 'hawaii',
 '.',
 'http://t.co/zDtoyd8EbJ']

In [79]:
new_temp = re.sub(r"http\S+", "", " ".join(temp))

In [80]:
new_temp

'm1 . 94 [ 01:04 utc ] ? 5km s of volcano hawaii . '

In [81]:
new_temp2 = re.sub('[()!?]', ' ', new_temp)

In [82]:
new_temp2

'm1 . 94 [ 01:04 utc ]   5km s of volcano hawaii . '

In [84]:
new_temp3 = re.sub('\[.*?\]',' ', new_temp2)

In [85]:
new_temp3

'm1 . 94     5km s of volcano hawaii . '

In [86]:
new_temp4 = re.sub("[^a-z0-9]"," ", new_temp3)

In [87]:
new_temp4

'm1   94     5km s of volcano hawaii   '

In [94]:
test_stopwords = ["for", "on", "an", "a", "of", "and", "in", "the", "to", "from"]

temp5 = [w for w in new_temp4.split() if not w in test_stopwords]
temp6 = " ".join(word for word in temp5)

In [95]:
temp6

'm1 94 5km s volcano hawaii'

In [103]:
temp7 = re.sub(r'\b\w{1,3}\b', '', temp6)

In [105]:
temp7.split()

['volcano', 'hawaii']

In [99]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [100]:
container2 = []
for word in stopwords.words('english'):
    if len(word)==1:
        container2.append(word)

In [101]:
container2

['i', 'a', 's', 't', 'd', 'm', 'o', 'y']

In [19]:
lemma.lemmatize("dogs")

'dog'

In [22]:
lemma.lemmatize("going",pos = "v")

'go'

In [26]:
for word in "dogs going to dancing".split():
    print(lemma.lemmatize(word))


dog
going
to
dancing


In [29]:
nltk.pos_tag(['dog'])

[('dog', 'NN')]

In [30]:
nltk.pos(['going'])

AttributeError: module 'nltk' has no attribute 'pos'

In [54]:
sentence = "dogs going to be playing"

In [56]:
test_lemmas = lemma.lemmatize(sentence)

In [60]:
test_lemmas = nltk.pos_tag(sentence.split())

In [61]:
test_lemmas

[('dogs', 'NNS'),
 ('going', 'VBG'),
 ('to', 'TO'),
 ('be', 'VB'),
 ('playing', 'VBG')]

In [64]:
container = []
for word,lem in test_lemmas:
    pos = get_wordnet_pos(lem)
    if pos:
        container.append(lemma.lemmatize(word,pos=pos))
    else:
        container.append(lemma.lemmatize(word))

In [65]:
container

['dog', 'go', 'to', 'be', 'play']

In [52]:
get_wordnet_pos(nltk.pos_tag(['dog'])[0][1])

'n'

In [41]:
tag = nltk.pos_tag(['going'])

In [50]:
get_wordnet_pos(tag[0][1])

'v'

In [53]:
tag

[('going', 'VBG')]