# Twitter Disaster Detection

## Project Preparation

### Imports

#### Python Imports

In [30]:
import pandas as pd
from IPython.display import display
import configparser
import modules.pred_word_vectorizer as wv
import modules.pred_decision_trees as decision_trees
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import plot_confusion_matrix, confusion_matrix, accuracy_score

#### Settings Import

In [31]:
# Load the notebook settings from an INI file (path is relative to the
# working directory).
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check that list to fail with a clear message
# instead of an opaque KeyError on the section lookup below.
CONFIG_FILE = 'main.ini'
settings = configparser.ConfigParser()
if not settings.read(CONFIG_FILE):
    raise FileNotFoundError(
        f'Settings file {CONFIG_FILE!r} was not found or could not be read'
    )
# Location of the source file, taken from the `file` key of the [FILE] section.
file_path = settings['FILE']['file']

#### Import Source File into DataFrame

In [32]:
# Read the source file named in the settings into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

### Data Exploration

In [33]:
# Data exploration is currently disabled. NOTE(review): `mod_de` is not among
# the imports visible at the top of this notebook, so the call below would
# raise a NameError if it were simply uncommented -- import the exploration
# module first.
# mod_de.data_exploration(df)

### Data Cleaning

In [34]:
# Run the project's cleaning helper on the loaded tweets. The return value is
# not assigned and the frame displayed below carries a `clean` column, so the
# helper presumably mutates `df` in place -- TODO confirm in
# modules.pred_word_vectorizer.
wv.cleaning(df)
# Display the frame to inspect the result of the cleaning step.
df

Unnamed: 0,id,keyword,location,text,target,clean
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason this earthquake may a...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked shelter place are being ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation ord...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska smok...
...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,two giant cranes holding bridge collapse into ...
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,@@@ @@@ the out control wild fires california ...
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,utc s volcano hawaii http
7611,10872,,,Police investigating after an e-bike collided ...,1,police investigating after e bike collided wit...


### Word-Vectorizer

In [None]:
# Apply the project's lemmatizer helper. The return value is discarded, so
# this presumably updates `df` in place -- TODO confirm in
# modules.pred_word_vectorizer.
wv.lemmatizer(df)
# Display the frame to inspect the result.
df

In [None]:
# Obtain the train/test features and labels from the project helper.
# NOTE(review): how the text is vectorized and how the split is made is decided
# inside wv.vectorizer and is not visible in this notebook.
X_train, X_test, y_train, y_test = wv.vectorizer(df)

# Hand the frame and the four split parts to the project's modeling helper and
# display whatever it returns (presumably a DataFrame -- TODO confirm).
df_test = wv.modeler(df, X_train, X_test, y_train, y_test)
df_test

### Tree-Based Models (Decision Tree, AdaBoost)

In [36]:
# Build the model inputs: `X` comes from the project's simple_vectorizer helper
# (implementation not shown in this notebook) and `y` is the `target` column
# of the frame.
X = wv.simple_vectorizer(df)
y = df['target']

X_train  (6090, 12477)
y_train  (6090, 1)
X_test   (1523, 12477)
y_test   (1523, 1)
matrix shape (1523, 12477)
y shape (1523,)
accuracy_score 0.81


  return f(**kwargs)


Unnamed: 0,id,keyword,location,text,target,clean,y_pred
2,5,,,All residents asked to 'shelter in place' are ...,1,resident asked shelter place notified officer ...,1
10,16,,,Three people died from the heat wave so far,1,three people died heat wave far,1
12,18,,,#raining #flooding #Florida #TampaBay #Tampa 1...,1,raining flooding florida tampabay tampa day lo...,1
19,28,,,What a goooooooaaaaaal!!!!!!,0,goooooooaaaaaal,0
24,36,,,LOOOOOOL,0,looooool,0
...,...,...,...,...,...,...,...
7591,10846,,,Heat wave warning aa? Ayyo dei. Just when I pl...,1,heat wave warning ayyo dei plan visit friend year,1
7598,10853,,,Father-of-three Lost Control of Car After Over...,1,father three lost control car overtaking colli...,1
7605,10864,,,on the flip side I'm at Walmart and there is a...,1,flip side walmart bomb everyone evacuate stay ...,1
7606,10866,,,Suicide bomber kills 15 in Saudi security site...,1,suicide bomber kill saudi security site mosque...,1


### Tree-Based Models (Decision Tree, AdaBoost)

In [37]:
# NOTE(review): this cell repeats the `X` / `y` preparation of an earlier cell
# verbatim, so the vectorizer helper runs a second time on the same frame.
# `X` comes from the project's simple_vectorizer helper and `y` is the
# `target` column of the frame.
X = wv.simple_vectorizer(df)
y = df['target']

In [38]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Print the four shapes on one line as a quick sanity check of the split.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(6090, 12477) (1523, 12477) (6090,) (1523,)


In [39]:
# Run the project's helpers for the tree-based models on the training split
# and print what each helper returns next to the model name.
# The helper names are spelled exactly as this notebook calls them
# (`cvs_...` for the decision tree, `csv_...` for AdaBoost).
for label, run_model in (
    ('DecisionTreeClassifier:', decision_trees.cvs_decision_tree),
    ('AdaBoostClassifier:', decision_trees.csv_adaboost),
):
    print(label, run_model(X_train, y_train))

# Local problems with the Python xgboost package / check later
# print('xgboost:' , decision_trees.csv_xgboost(X_train, y_train))

Model: DecisionTreeClassifier()
Training set score: 0.986863711001642
CV score: 0.7371100164203612
DecisionTreeClassifier: [1 0 1 ... 0 1 1]
Model: AdaBoostClassifier()
Training set score: 0.7656814449917898
CV score: 0.7380952380952381
AdaBoostClassifier: [1 0 1 ... 0 1 0]


In [40]:
### Playing with pipelines

In [41]:
# Naive Bayes on the cleaned text column only.
# TODO: try other features in addition to the cleaned text.
# Note: this rebinds `X`, `y` and the four split variables that the
# tree-model cells above filled with the vectorized matrix.
X = df['clean']
y = df['target']

# Same 80/20 split and seed as used for the tree-based models.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

# Three ways of turning text into features, each feeding the same classifier.
# The vectorizer lives inside the pipeline, so it is re-fitted on the training
# part of every cross-validation fold.
pipe_tf = Pipeline(steps=[
    ('data_vec', TfidfVectorizer()),
    ('model', MultinomialNB()),
])

pipe_cv = Pipeline(steps=[
    ('data_cv', CountVectorizer()),
    ('model', MultinomialNB()),
])

# Raw counts followed by TF-IDF re-weighting.
pipe_comb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('model', MultinomialNB()),
])

# Report the mean cross-validation score of every pipeline on the training split.
for message, candidate in (
    ('MultinomialNB cross_val_score, TfidfVectorizer :', pipe_tf),
    ('MultinomialNB cross_val_score, CountVectorizer :', pipe_cv),
    ('MultinomialNB cross_val_score, pipe_comb :', pipe_comb),
):
    print(message, cross_val_score(candidate, X_train, y_train).mean())

(6090,) (1523,) (6090,) (1523,)
MultinomialNB cross_val_score, TfidfVectorizer : 0.7967159277504104
MultinomialNB cross_val_score, CountVectorizer : 0.7970443349753695
MultinomialNB cross_val_score, pipe_comb : 0.7967159277504104


In [42]:


# Fit every Naive Bayes pipeline on the training split, then report its
# accuracy and confusion matrix on the held-out test split. A single loop
# evaluates all pipelines so the reporting cannot drift apart between them.
print('Naive Bayes:')
evaluated_pipelines = (
    ('TfidfVectorizer', pipe_tf),
    ('CountVectorizer', pipe_cv),
    ('combined', pipe_comb),
)
for position, (label, pipeline) in enumerate(evaluated_pipelines):
    if position:
        # Blank line between two reports (none before the first one).
        print()
    pipeline.fit(X_train, y_train)
    # `pipeline_preds` and `val_confusion_matrix` stay top-level names; after
    # the loop they hold the results of the last pipeline (`pipe_comb`).
    pipeline_preds = pipeline.predict(X_test)
    print(f'accuracy_score: {label}', accuracy_score(y_test, pipeline_preds))
    val_confusion_matrix = confusion_matrix(y_test, pipeline_preds)
    print(f'Confusion Matrix: \n{val_confusion_matrix}')


Naive Bayes:
accuracy_score: TfidfVectorizer 0.8017071569271176
Confusion Matrix: 
[[779  95]
 [207 442]]

accuracy_score: CountVectorizer 0.7977675640183848
Confusion Matrix: 
[[740 134]
 [174 475]]

accuracy_score: combined 0.8017071569271176
Confusion Matrix: 
[[779  95]
 [207 442]]
