In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import re
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import folium 
from folium import plugins 

from sklearn.metrics import accuracy_score

In [None]:
# Load the train and test CSVs of the "nlp-getting-started" dataset into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/nlp-getting-started/train.csv',
                     '/kaggle/input/nlp-getting-started/test.csv')
)

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Column dtypes and non-null counts for the training frame.
df_train.info()

In [None]:
# Side-by-side bar charts of how many training rows are missing (True) vs.
# present (False) in the `keyword` and `location` columns.
fig = plt.figure(figsize=(12,9))

# Count the null flags once per column instead of recomputing them per argument.
keyword_missing = df_train['keyword'].isnull().value_counts()
location_missing = df_train['location'].isnull().value_counts()

ax1 = fig.add_subplot(121)
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=keyword_missing.index, y=keyword_missing.values, palette='mako', ax=ax1)
ax1.set_title('Missing Values in Keyword')

ax2 = fig.add_subplot(122)
sns.barplot(x=location_missing.index, y=location_missing.values, palette='mako', ax=ax2)
ax2.set_title('Missing Values in Location')

fig.suptitle('Missing Values')
plt.show()

In [None]:
# Bar chart of how many training tweets fall into each target class.
plt.figure(figsize=(12,9))
target_counts = df_train['target'].value_counts()
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=target_counts.index, y=target_counts.values)
plt.title('Target Values')
plt.xlabel('0:not disaster|1:disaster')
plt.show()

In [None]:
# Class balance as a pie chart: number of training rows per target label.
df_tgroup = df_train.groupby('target').size()

df_tgroup.plot.pie(
    subplots=True,
    figsize=(10, 8),
    autopct="%.2f%%",
    colors=['blue','green'],
)
plt.title("Pie chart of Target", fontsize=16)
plt.legend()
plt.show()

In [None]:
# Geocode the 20 most frequent `location` values so they can be drawn on a map.
data = (
    df_train['location']
    .value_counts()
    .head(20)
    .rename_axis('location')
    .reset_index(name='counts')
)

geolocator = Nominatim(user_agent='Location Map')
# Throttle to at most one geocoding request per second.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

dict_lat = {}
dict_long = {}
for i in data.location.values:
    print(i)
    location = geocode(i)
    # Free-text locations may not resolve; geocode() then returns None and
    # reading .latitude would raise AttributeError, so skip those entries.
    if location is None:
        continue
    dict_lat[i] = location.latitude
    dict_long[i] = location.longitude

data['latitude'] = data.location.map(dict_lat)
data['longitude'] = data.location.map(dict_long)
# Unresolved locations map to NaN; drop them so only plottable rows remain.
data = data.dropna(subset=['latitude', 'longitude']).reset_index(drop=True)

In [None]:
# World map with one red circle per geocoded location, sized by its tweet count.
location_map = folium.Map(location=[7.0,7.0], zoom_start=2)
markers=2  # NOTE(review): not read anywhere in the visible notebook — confirm before removing

for _idx, row in data.iterrows():
    # Skip rows that cannot be drawn. Previously a non-positive count reused the
    # radius of the preceding row (or raised NameError on the first row).
    if not row['counts'] > 0 or pd.isnull(row['latitude']) or pd.isnull(row['longitude']):
        continue
    folium.CircleMarker(
        [float(row['latitude']), float(row['longitude'])],
        radius=float(row['counts'] * 0.4),  # scale the count down to a readable radius
        color='red',
        fill=True,
    ).add_to(location_map)

location_map

In [None]:
# Lookup table of chat/Twitter abbreviations and their expansions.
# Keys MUST be lowercase: the lookup lower-cases each token before searching,
# so a key containing uppercase letters can never match.
abbreviations = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk",
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart",
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "ig" : "instagram",  # was "IG", which the lower-cased lookup could never hit
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens", #"que pasa",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet",
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously",
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespoonful",
    "tbsp" : "tablespoonful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}

In [None]:
def word_abbrev(word):
    """Return the expansion of ``word`` if its lowercase form is a known abbreviation.

    The lowercase form of ``word`` is looked up in the module-level
    ``abbreviations`` dict; when there is no entry the original ``word`` is
    returned unchanged (case preserved).
    """
    return abbreviations.get(word.lower(), word)

# Replace all abbreviations
def replace_abbrev(text):
    """Expand every whitespace-separated token of ``text`` through ``word_abbrev``.

    Each resulting token is followed by a single space, so runs of whitespace
    collapse to one space and a non-empty result ends with a trailing space.
    """
    return "".join(word_abbrev(token) + " " for token in text.split())
# Start the `cleaned_text` column from the raw tweet text with abbreviations expanded.
df_train['cleaned_text'] = df_train['text'].map(replace_abbrev)
df_test['cleaned_text'] = df_test['text'].map(replace_abbrev)

In [None]:
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

# Download the NLTK resources requested by this notebook, in the original order.
for _resource in ('wordnet', 'stopwords', 'punkt'):
    nltk.download(_resource)

# Shared lemmatizer and English stopword list used by `cleaning` below.
lem = WordNetLemmatizer()
stop_words = nltk.corpus.stopwords.words(['english'])

print(stop_words)

def cleaning(data):
    """Normalise one tweet into a space-joined string of lowercase lemmas.

    Pipeline: strip URLs, hashtags and @mentions, replace every run of
    non-letters with a space, lowercase, tokenize, drop stopwords, lemmatize.

    Parameters
    ----------
    data : str
        Tweet text to clean.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces (possibly empty).
    """
    # 1. Remove URLs, hashtags (the tag word is dropped too) and @mentions.
    text = re.sub(r'http\S+', ' ', data)
    text = re.sub(r'#\w+', ' ', text)
    text = re.sub(r'@\w+', ' ', text)

    # 2. Keep ASCII letters only; digits, punctuation and emoji become spaces.
    text = re.sub('[^A-Za-z]+', ' ', text)

    # 3. Lowercase BEFORE filtering: the stopword comparison below is
    #    case-sensitive, so without this capitalised stopwords would survive.
    text = text.lower()

    # 4. Tokenize.
    tokens = TweetTokenizer().tokenize(text)

    # 5. Keep purely alphabetic tokens that are not stopwords.
    kept = [token for token in tokens if token.isalpha() and token not in stop_words]

    # 6. Lemmatize and join back into a single string.
    return " ".join(lem.lemmatize(token) for token in kept)

In [None]:
# Run the full cleaning pipeline over the abbreviation-expanded text.
df_train['cleaned_text'] = df_train['cleaned_text'].map(cleaning)
df_test['cleaned_text'] = df_test['cleaned_text'].map(cleaning)

In [None]:
# Lowercase the cleaned text in both frames.
df_train['cleaned_text'] = df_train['cleaned_text'].str.lower()
df_test['cleaned_text'] = df_test['cleaned_text'].str.lower()

In [None]:
# Drop training tweets whose cleaned text is identical, keeping the first occurrence.
df_train = df_train.drop_duplicates(subset=['cleaned_text'])
# df_test is deliberately NOT de-duplicated: the submission is built from
# df_test['id'], so removing rows here would leave those ids without a prediction.

In [None]:
# Remove every column that contains at least one missing value.
df_train = df_train.dropna(axis='columns', how='any')
df_test = df_test.dropna(axis='columns', how='any')

In [None]:
def collect_tokens(data, target):
    """Return a flat list of all tokens from rows whose ``target`` equals ``target``.

    ``data`` must provide a ``target`` column and a string ``cleaned_text``
    column; each text is split on whitespace and the tokens are concatenated
    in row order.
    """
    token_lists = data.loc[data['target'] == target, 'cleaned_text'].str.split()
    return [token for tokens in token_lists for token in tokens]

In [None]:
# Token lists per class: target 1 -> disaster tweets, target 0 -> non-disaster tweets.
disaster_tokens, non_disaster_tokens = (
    collect_tokens(df_train, label) for label in (1, 0)
)

In [None]:
# Word cloud of the 50 most frequent words across all disaster tweets.
plt.figure(figsize=(14,8))
# Feed the whole token list and cap the cloud with max_words: slicing the list
# to [:50] only covered the first 50 tokens, not the most common words.
word_cloud = WordCloud(
    background_color="white",
    max_font_size=60,
    max_words=50,
).generate(" ".join(disaster_tokens))
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis('off')
plt.title('Most words in disaster ', fontsize=20)
plt.show()

In [None]:
# Word cloud of the 50 most frequent words across all non-disaster tweets.
plt.figure(figsize=(14,8))
# Feed the whole token list and cap the cloud with max_words: slicing the list
# to [:50] only covered the first 50 tokens, not the most common words.
word_cloud = WordCloud(
    background_color="white",
    max_font_size=60,
    max_words=50,
).generate(" ".join(non_disaster_tokens))
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis('off')
plt.title('Most words in non disaster ', fontsize=20)
plt.show()

## Approach 1: Bag-of-words features with classical classifiers

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

max_features = 5000
count_vectorizer = CountVectorizer(max_features=max_features)
sparce_matrix_train = count_vectorizer.fit_transform(df_train['cleaned_text'])
# Only transform the test text: re-fitting here would learn a different
# vocabulary, so train and test columns would no longer line up.
sparce_matrix_test = count_vectorizer.transform(df_test['cleaned_text'])

def count_vector(data):
    """Fit a fresh CountVectorizer on ``data``.

    Returns a ``(matrix, vectorizer)`` tuple: the document-term count matrix
    for ``data`` and the fitted vectorizer that produced it.
    """
    count = CountVectorizer()
    vector = count.fit_transform(data)
    # Return the vectorizer fitted here, not the module-level `count_vectorizer`;
    # otherwise later transforms use a vocabulary that does not match `vector`.
    return vector, count

def tfidf_vector(data):
    """Fit a fresh TfidfVectorizer on ``data``.

    Returns a ``(matrix, vectorizer)`` tuple: the TF-IDF matrix for ``data``
    and the fitted vectorizer that produced it.
    """
    tfidf = TfidfVectorizer()
    vector_tfidf = tfidf.fit_transform(data)
    return vector_tfidf, tfidf

# Fit on the training text only, then reuse the fitted vectorizers for the test text.
X_train_count, count_vectorizer = count_vector(df_train['cleaned_text'])
X_train_tfidf, tfidf_vectorizer = tfidf_vector(df_train['cleaned_text'])

X_test_count = count_vectorizer.transform(df_test['cleaned_text'])
X_test_tfidf = tfidf_vectorizer.transform(df_test['cleaned_text'])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, f1_score, classification_report

# Seed NumPy's global random generator.
np.random.seed(0)
# Seed passed as `random_state` to the scikit-learn estimators in the `models` list.
random_state = 29

In [None]:
def fit_pred(model, X_train,X_test,y_train,y_test):
    """Fit ``model`` on the training split and report how it does on the test split.

    Shows a confusion-matrix heatmap titled with the model's string form, then
    prints the classification report and the train/test accuracy as whole
    percentages. ``model`` is fitted in place; nothing is returned.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Confusion matrix of true vs. predicted labels on the test split.
    _, ax = plt.subplots(figsize=(3,3))
    sns.heatmap(
        confusion_matrix(y_test, predictions),
        annot=True,
        linewidths=0.5,
        cbar=False,
        linecolor="red",
        fmt='.0f',
        ax=ax,
    )
    ax.set_xlabel("y_predict")
    ax.set_ylabel("y_true")
    ax.set(title=str(model))
    plt.show()

    # Accuracies as rounded percentages.
    train_accuracy = round(model.score(X_train, y_train) * 100)
    test_accuracy = round(accuracy_score(y_test, predictions) * 100)

    print(classification_report(y_test, predictions))
    print('Accuracy of classifier on training set:{}%'.format(train_accuracy))
    print('Accuracy of classifier on test set:{}%' .format(test_accuracy))

In [None]:
# Candidate classifiers to compare; estimators that are given a seed
# all share the same `random_state`.
models=[
        XGBClassifier(max_depth=6, n_estimators=1000),
        LogisticRegression(random_state=random_state),
        SVC(random_state=random_state),
        MultinomialNB(),
        DecisionTreeClassifier(random_state = random_state),
        KNeighborsClassifier(),
        RandomForestClassifier(random_state=random_state),
       ]

In [None]:
# Fit and score every candidate model on both feature representations.
y = df_train['target']  # loop-invariant, so assigned once before the loop
for m in models:
    print('COUNTVECTOR')
    X = X_train_count
    # A fixed random_state gives every model and both representations the same
    # 70/30 split, so their scores are directly comparable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=random_state
    )
    fit_pred(m, X_train, X_test, y_train, y_test)

    print('TFIDFVECTOR')
    X = X_train_tfidf
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=random_state
    )
    fit_pred(m, X_train, X_test, y_train, y_test)

## Approach 2: Neural network (Embedding + Conv1D + LSTM) on padded token sequences

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.layers as Layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from keras.layers import Dense, Dropout, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Bidirectional, SpatialDropout1D, MaxPooling1D, GRU
from keras.models import load_model


from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# Inputs for the neural model: cleaned text as features, one-hot encoded target as labels.
num_classes = df_train['target'].nunique()
X = df_train['cleaned_text']
y = pd.get_dummies(df_train['target']).to_numpy()

In [None]:
seed = 101 # fix random seed for reproducibility
np.random.seed(seed)

# Stratified 80/20 split, then show the shape of each part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=seed
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Fit the tokenizer on the training texts only, then convert both splits to
# sequences of integer word indices.
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train.tolist())
X_train, X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)


In [None]:
from tensorflow.keras.preprocessing import sequence
# Pad/truncate every sequence to exactly `max_words` entries.
max_words = 30
X_train = pad_sequences(X_train, maxlen=max_words)
X_test = pad_sequences(X_test, maxlen=max_words)
print(X_train.shape,X_test.shape)

In [None]:
import tensorflow.keras.backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,Conv1D,MaxPooling1D,LSTM, Dropout
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

# Training hyper-parameters.
batch_size = 128
epochs = 20

# Embedding size: vocabulary cap (same value given to the Tokenizer) and vector width.
max_features = 20000
embed_dim = 100

K.clear_session()
# Seed NumPy AND TensorFlow: weight initialisation and dropout draw from
# TensorFlow's generator, which np.random.seed alone does not control.
np.random.seed(seed)
tf.random.set_seed(seed)

# Embedding -> two Conv1D/MaxPooling1D stages -> LSTM -> softmax over the classes.
model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length=X_train.shape[1]))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='softmax'))
# categorical_crossentropy matches the one-hot encoded labels built with get_dummies.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, CSVLogger, EarlyStopping

def callbacks():
    """Build the list of Keras callbacks used when fitting the model.

    Returns
    -------
    list
        ``[ReduceLROnPlateau, CSVLogger, EarlyStopping]``: halve the learning
        rate after one epoch without ``val_loss`` improvement, log each epoch
        to ``log.csv``, and stop after five epochs without improvement while
        restoring the best weights.
    """
    # `restore_best_weights` is an EarlyStopping option, not a ReduceLROnPlateau
    # one, so it is no longer passed here.
    reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                  factor=0.5, patience=1,
                                  verbose=1, mode='min',
                                  min_delta=0.0001, min_lr=0)
    log = CSVLogger('log.csv')
    es = EarlyStopping(monitor='val_loss', patience=5, verbose=0,
                       mode='min', restore_best_weights=True)
    return [reduce_lr, log, es]

In [None]:
# Train the network; the held-out split doubles as validation data, which the
# callbacks monitor through `val_loss`.
history = model.fit(
    X_train, 
    y_train, 
    validation_data=(X_test, y_test),
    epochs=epochs, 
    batch_size=batch_size, 
    verbose=2,
    callbacks = callbacks()
)

In [None]:
# Loss and accuracy on the held-out split (the same data used for validation above).
model.evaluate(X_test, y_test)

In [None]:
# Encode the competition test text with the fitted tokenizer and pad it to
# the same length as the training sequences.
test_data = sequence.pad_sequences(
    tokenizer.texts_to_sequences(df_test['cleaned_text']),
    maxlen=max_words,
)

In [None]:
# Per-class softmax outputs for every test row.
pred = model.predict(test_data)
pred

In [None]:
# Submission columns: the test ids and, per row, the index of the highest-scoring class.
idx = list(df_test['id'])
target = list(np.argmax(pred, axis=1))

In [None]:
# Assemble the submission table: one row per test id with its predicted class.
submit = pd.DataFrame({
    'id':idx,
    'target':target
})

In [None]:
# Display the submission table for a visual check.
submit

In [None]:
# Write the submission file without the DataFrame index column.
submit.to_csv('submission.csv', index=False)