In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score

# suppress warnings
import warnings
warnings.filterwarnings('ignore')


In [41]:
# Load the training split of the Disaster-Tweets-Kaggle data set from GitHub
DATA_URL = 'https://raw.githubusercontent.com/nikjohn7/Disaster-Tweets-Kaggle/main/data/train.csv'
df = pd.read_csv(DATA_URL)

# Preview a random sample of rows; the fixed seed keeps the preview
# identical across kernel restarts and re-runs of this cell
df.sample(20, random_state=42)

Unnamed: 0,id,keyword,location,text,target
2095,3011,death,New York,Xbox 360 Pro Console - *Red Ring of Death* - F...,0
7276,10415,whirlwind,Richardson TX,???? throwback Thurs ?? ???? Will You Still L...,0
4478,6370,hostages,,New #Free #Porn #Clip! Taking Of Hostages Dang...,0
342,490,armageddon,,Paul Craig Roberts ÛÒ Vladimir Putin Issues M...,1
5117,7297,nuclear%20reactor,,Finnish Nuclear Plant to Move Ahead After Fina...,0
6119,8735,sinking,,That horrible sinking feeling when youÛªve be...,1
603,872,bioterror,,FedEx no longer to transport bioterror germs i...,1
6334,9055,structural%20failure,,@SirTitan45 Mega mood swing on a 24 hr schedu...,0
731,1057,bleeding,,I've been bleeding in your silence \nI feel sa...,0
2656,3815,detonate,"Brasil,SP",Apollo Brown - 'Detonate' f. M.O.P. | http://t...,0


In [42]:
# size of the data set as a (number of rows, number of columns) tuple
df.shape

(7613, 5)

In [43]:
# count the disaster tweets (target == 1): the first entry of the
# resulting shape tuple is the number of matching rows
df.loc[df['target'].eq(1)].shape

(3271, 5)

In [44]:
# count the non-disaster tweets (target == 0): the first entry of the
# resulting shape tuple is the number of matching rows
df.loc[df['target'].eq(0)].shape

(4342, 5)

In [45]:
# display the tweets that are labelled as disasters (target == 1)
df.query('target == 1')

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [46]:
# display the tweets that are labelled as non-disasters (target == 0)
df.query('target == 0')

Unnamed: 0,id,keyword,location,text,target
15,23,,,What's up man?,0
16,24,,,I love fruits,0
17,25,,,Summer is lovely,0
18,26,,,My car is so fast,0
19,28,,,What a goooooooaaaaaal!!!!!!,0
...,...,...,...,...,...
7581,10833,wrecked,Lincoln,@engineshed Great atmosphere at the British Li...,0
7582,10834,wrecked,,Cramer: Iger's 3 words that wrecked Disney's s...,0
7584,10837,,,These boxes are ready to explode! Exploding Ki...,0
7587,10841,,,Sirens everywhere!,0


In [47]:
# Make sure the NLTK stop-word corpus is available locally
import nltk
nltk.download('stopwords')

# Extra filler words to drop on top of NLTK's English stop-word list
include_stopwords = {'dear', 'regards', 'must', 'would', 'also'}

# NOTE: this rebinds the name `stopwords` (imported from nltk.corpus at the
# top of the notebook) to a plain set of words; later cells read this set.
stopwords = set(nltk.corpus.stopwords.words('english')).union(include_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tianennnn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [48]:
# Build a text-processing + classifier pipeline that predicts whether a
# tweet is about a disaster (target == 1) or not (target == 0).

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Fixed seed so the train/test split -- and therefore every metric printed
# below -- is the same on each Restart & Run All
SPLIT_SEED = 42

# Hold out 20% of the tweets for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['target'],
    test_size=0.2,
    random_state=SPLIT_SEED,
)

# Pipeline: raw tweet text -> TF-IDF vectors (custom stop words removed) -> SVM
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=list(stopwords))),
    ('clf', svm.SVC()),
])

# Train the classifier on the training tweets
text_clf.fit(X_train, y_train)

# Predict labels for the held-out tweets
y_pred = text_clf.predict(X_test)

# Per-class precision / recall / F1; target_names follow label order 0, 1
print(classification_report(y_test, y_pred, target_names=['Non-Disasters', 'Disasters']))


               precision    recall  f1-score   support

Non-Disasters       0.80      0.90      0.85       889
    Disasters       0.83      0.68      0.75       634

     accuracy                           0.81      1523
    macro avg       0.82      0.79      0.80      1523
 weighted avg       0.81      0.81      0.81      1523



In [49]:
# This script creates a new column 'sentiment' in the dataframe,
# which contains the sentiment score of the text. 
# The sentiment score is a float within the range [-1.0, 1.0], 
# where -1.0 denotes a very negative sentiment, 
# 1.0 denotes a very positive sentiment, 
# and values around 0 denote a neutral sentiment.

from textblob import TextBlob

# Define a function to apply sentiment analysis to a text
def get_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Args:
        text: The text to analyse; it is passed unchanged to ``TextBlob``.

    Returns:
        The ``sentiment.polarity`` value of the resulting blob, a score
        between -1 (negative) and 1 (positive).
    """
    return TextBlob(text).sentiment.polarity

# Score every tweet's text and store the result in a new 'sentiment' column
df['sentiment'] = df['text'].map(get_sentiment)


In [50]:
# mean sentiment score per class (0 = non-disaster, 1 = disaster)
df['sentiment'].groupby(df['target']).mean()

target
0    0.070622
1    0.018631
Name: sentiment, dtype: float64

In [53]:
# mean sentiment score per location, most positive first
# NOTE(review): the exact +/-1.0 extremes suggest locations with very few
# tweets dominate both ends of the ranking -- confirm with a per-location count
df['sentiment'].groupby(df['location']).mean().sort_values(ascending=False)

location
The Waystone Inn               1.0
The Main                       1.0
Morocco                        1.0
Paranaque City                 1.0
Mostly Yuin.                   1.0
                              ... 
fujo garbage heaven           -1.0
Freeport IL. USA              -1.0
Mumbai india                  -1.0
sri lanka                     -1.0
Deployed in the Middle East   -1.0
Name: sentiment, Length: 3341, dtype: float64

In [52]:
# mean sentiment score per keyword, most positive first
# (keywords are grouped as stored, e.g. with '%20' in place of spaces)
df['sentiment'].groupby(df['keyword']).mean().sort_values(ascending=False)

keyword
hazardous               0.457891
razed                   0.418946
outbreak                0.312661
mayhem                  0.277262
wreckage                0.273440
                          ...   
trapped                -0.160049
structural%20failure   -0.195099
airplane%20accident    -0.202232
violent%20storm        -0.510888
bloody                 -0.522698
Name: sentiment, Length: 221, dtype: float64