In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
print("There are {} observations and {} features in this dataset".format(data.shape[0],data.shape[1]))

There are 7613 observations and 5 features in this dataset


In [4]:
data.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [5]:
#removing all the columns with null values
dataset = data[['id','text','target']]
dataset.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
dataset.isna().sum()

id        0
text      0
target    0
dtype: int64

In [7]:
dataset.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
# removing irrelavant characters
import re

def clean_non_alphanumeric(text):
    return re.sub('[^a-zA-Z]',' ',text)

dataset['text'] = dataset['text'].apply(clean_non_alphanumeric)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [9]:
dataset.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this earthquake M...,1
1,4,Forest fire near La Ronge Sask Canada,1
2,5,All residents asked to shelter in place are ...,1
3,6,people receive wildfires evacuation or...,1
4,7,Just got sent this photo from Ruby Alaska as ...,1


In [10]:
# Converting text to lowercase

def clean_lowercase(text):
    return str(text).lower()

dataset['text'] = dataset['text'].apply(clean_lowercase)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [11]:
dataset.head()

Unnamed: 0,id,text,target
0,1,our deeds are the reason of this earthquake m...,1
1,4,forest fire near la ronge sask canada,1
2,5,all residents asked to shelter in place are ...,1
3,6,people receive wildfires evacuation or...,1
4,7,just got sent this photo from ruby alaska as ...,1


In [12]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nitin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
# using nltk library for tokenization

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

def clean_tokenization(text):
    return word_tokenize(text)

dataset['text'] = dataset['text'].apply(clean_tokenization)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nitin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [14]:
dataset.head(5)

Unnamed: 0,id,text,target
0,1,"[our, deeds, are, the, reason, of, this, earth...",1
1,4,"[forest, fire, near, la, ronge, sask, canada]",1
2,5,"[all, residents, asked, to, shelter, in, place...",1
3,6,"[people, receive, wildfires, evacuation, order...",1
4,7,"[just, got, sent, this, photo, from, ruby, ala...",1


In [15]:
# Removing stop words

from nltk.corpus import stopwords

# Let's look at the stop words in english
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [16]:
stop_words = set(stopwords.words('english'))
def clean_stopwords(token):
    return [item for item in token if item not in stop_words]

dataset['text'] = dataset['text'].apply(clean_stopwords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [17]:
dataset.head()

Unnamed: 0,id,text,target
0,1,"[deeds, reason, earthquake, may, allah, forgiv...",1
1,4,"[forest, fire, near, la, ronge, sask, canada]",1
2,5,"[residents, asked, shelter, place, notified, o...",1
3,6,"[people, receive, wildfires, evacuation, order...",1
4,7,"[got, sent, photo, ruby, alaska, smoke, wildfi...",1


In [18]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemma = WordNetLemmatizer()

def clean_lemmatization(token):
    return [lemma.lemmatize(word=w,pos='v') for w in token]

dataset['text'] = dataset['text'].apply(clean_lemmatization)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nitin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [19]:
# Remove all the words having length less than 2
def clean_length(token):
    return [i for i in token if len(i)>2]

dataset['text'] = dataset['text'].apply(clean_length)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [20]:
# Convert the list back to the string 

def convert_to_string(text):
    return ' '.join(text)

dataset['text'] = dataset['text'].apply(convert_to_string)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [21]:
dataset.head()

Unnamed: 0,id,text,target
0,1,deeds reason earthquake may allah forgive,1
1,4,forest fire near ronge sask canada,1
2,5,residents ask shelter place notify officer eva...,1
3,6,people receive wildfires evacuation order cali...,1
4,7,get send photo ruby alaska smoke wildfires pou...,1


In [22]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
dataset['text'] = dataset['text'].apply(lambda x : remove_emoji(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [23]:
dataset.head()

Unnamed: 0,id,text,target
0,1,deeds reason earthquake may allah forgive,1
1,4,forest fire near ronge sask canada,1
2,5,residents ask shelter place notify officer eva...,1
3,6,people receive wildfires evacuation order cali...,1
4,7,get send photo ruby alaska smoke wildfires pou...,1


In [24]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [25]:
test.isnull().sum()

id             0
keyword       26
location    1105
text           0
dtype: int64

In [26]:
#removing all the columns with null values
test = test[['id','text']]
test.dropna(inplace=True)

In [27]:
test.isnull().sum()

id      0
text    0
dtype: int64

In [28]:
test.head()

Unnamed: 0,id,text
0,0,Just happened a terrible car crash
1,2,"Heard about #earthquake is different cities, s..."
2,3,"there is a forest fire at spot pond, geese are..."
3,9,Apocalypse lighting. #Spokane #wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan


In [29]:
# removing irrelavant characters
import re

def clean_non_alphanumeric(text):
    return re.sub('[^a-zA-Z]',' ',text)

test['text'] = test['text'].apply(clean_non_alphanumeric)

In [30]:
# Converting text to lowercase

def clean_lowercase(text):
    return str(text).lower()

test['text'] = test['text'].apply(clean_lowercase)

In [31]:
# using nltk library for tokenization

import nltk
#nltk.download('punkt')
from nltk.tokenize import word_tokenize

def clean_tokenization(text):
    return word_tokenize(text)

test['text'] = test['text'].apply(clean_tokenization)

In [32]:
stop_words = set(stopwords.words('english'))
def clean_stopwords(token):
    return [item for item in token if item not in stop_words]

test['text'] = test['text'].apply(clean_stopwords)

In [33]:
test.head()

Unnamed: 0,id,text
0,0,"[happened, terrible, car, crash]"
1,2,"[heard, earthquake, different, cities, stay, s..."
2,3,"[forest, fire, spot, pond, geese, fleeing, acr..."
3,9,"[apocalypse, lighting, spokane, wildfires]"
4,11,"[typhoon, soudelor, kills, china, taiwan]"


In [34]:
from nltk.stem import WordNetLemmatizer
#nltk.download('wordnet')
lemma = WordNetLemmatizer()

def clean_lemmatization(token):
    return [lemma.lemmatize(word=w,pos='v') for w in token]

test['text'] = test['text'].apply(clean_lemmatization)

In [35]:
test.head()

Unnamed: 0,id,text
0,0,"[happen, terrible, car, crash]"
1,2,"[hear, earthquake, different, cities, stay, sa..."
2,3,"[forest, fire, spot, pond, geese, flee, across..."
3,9,"[apocalypse, light, spokane, wildfires]"
4,11,"[typhoon, soudelor, kill, china, taiwan]"


In [36]:
# Remove all the words having length less than 2
def clean_length(token):
    return [i for i in token if len(i)>2]

test['text'] = test['text'].apply(clean_length)

In [37]:
# Convert the list back to the string 

def convert_to_string(text):
    return ' '.join(text)

test['text'] = test['text'].apply(convert_to_string)

In [38]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
test['text'] = test['text'].apply(lambda x : remove_emoji(x))

In [39]:
test.head()

Unnamed: 0,id,text
0,0,happen terrible car crash
1,2,hear earthquake different cities stay safe eve...
2,3,forest fire spot pond geese flee across street...
3,9,apocalypse light spokane wildfires
4,11,typhoon soudelor kill china taiwan


In [41]:
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [43]:
train_x = dataset['text']
train_y = dataset['target']

In [44]:
# vectorizing the training data text
t = time()  # not compulsory

# loading CountVectorizer
tf_vectorizer = CountVectorizer() # or term frequency

X_train_tf = tf_vectorizer.fit_transform(train_x)

duration = time() - t
print("Time taken to extract features from training data : %f seconds" % (duration))
print("n_samples: %d, n_features: %d" % X_train_tf.shape)

Time taken to extract features from training data : 0.237631 seconds
n_samples: 7613, n_features: 19416


In [47]:
test_x = test['text']

# vectorizing the test data text
t = time()
X_test_tf = tf_vectorizer.transform(test_x)

duration = time() - t
print("Time taken to extract features from test data : %f seconds" % (duration))
print("n_samples: %d, n_features: %d" % X_test_tf.shape)

Time taken to extract features from test data : 0.099677 seconds
n_samples: 3263, n_features: 19416


In [48]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_train_tf, train_y, test_size = 0.20)

In [49]:
from sklearn.svm import SVC
classifier = SVC(kernel='linear')
classifier.fit(x_train, y_train)

SVC(kernel='linear')

In [50]:
y_predict = classifier.predict(x_test)

In [51]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test,y_predict))
print(classification_report(y_test,y_predict))

[[728 126]
 [221 448]]
              precision    recall  f1-score   support

           0       0.77      0.85      0.81       854
           1       0.78      0.67      0.72       669

    accuracy                           0.77      1523
   macro avg       0.77      0.76      0.76      1523
weighted avg       0.77      0.77      0.77      1523



In [52]:
prediction = classifier.predict(X_test_tf)
print(prediction)

[1 1 1 ... 1 1 0]


In [53]:
submission=pd.DataFrame()
submission['id']=test['id']
submission['target'] = prediction
submission

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [54]:
submission.to_csv('C:/Users/nitin/OneDrive/Desktop/Projects/NLP/Disaster_tweet/submission.csv', index=False)