# Importing the training dataset

In [1]:
import pandas as pd
# Read the labelled training tweets and preview the first five rows.
train_data = pd.read_csv('data/train.csv')
print(train_data.head(n=5))

   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  


In [2]:
# Keep only the columns used further down (id, text, target).
# errors='ignore' makes this cell safe to re-run: once the columns are gone,
# a plain drop() would raise KeyError on the second execution.
train_data = train_data.drop(columns=['keyword', 'location'], errors='ignore')
print(train_data.head())

   id                                               text  target
0   1  Our Deeds are the Reason of this #earthquake M...       1
1   4             Forest fire near La Ronge Sask. Canada       1
2   5  All residents asked to 'shelter in place' are ...       1
3   6  13,000 people receive #wildfires evacuation or...       1
4   7  Just got sent this photo from Ruby #Alaska as ...       1


# Data preprocessing using Lemmatization

In [3]:
import re 
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [56]:
from nltk.stem import WordNetLemmatizer
wordnet = WordNetLemmatizer()

# Build the stop-word set once. Rebuilding it inside the comprehension meant
# re-reading the NLTK word list and constructing a new set for every token.
english_stopwords = set(stopwords.words('english'))

corpus_lemmatization_train = []

# Iterate over the column values directly so the loop does not depend on the
# frame having a default 0..n-1 index.
for tweet in train_data['text']:
    # Replace every character that is not an ASCII letter with a space,
    # lowercase the result and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', tweet).lower().split()

    # Drop stop words, lemmatize what remains, and store one cleaned string
    # per tweet (the vectorizer expects strings, not token lists).
    lemmas = [wordnet.lemmatize(word) for word in tokens if word not in english_stopwords]
    corpus_lemmatization_train.append(' '.join(lemmas))

# TF-IDF model

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF vectorizer on the cleaned training corpus and build the feature matrix.
# NOTE(review): the vectorizer is fitted on every row before the train/validation
# split below, so its vocabulary and IDF weights also reflect the held-out rows;
# the validation scores may therefore be slightly optimistic.
cv = TfidfVectorizer()
# .toarray() turns the vectorizer output into a dense NumPy array; with one column
# per vocabulary term this can be memory-heavy.
X = cv.fit_transform(corpus_lemmatization_train).toarray()

In [58]:
# Labels for the classifier: the `target` column of the training frame.
y = train_data['target']

# Splitting the Data

In [59]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Building the RandomForestClassifier

In [60]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 200 trees; random_state pins the tree construction so
# repeated runs give the same model.
randomclassifier = RandomForestClassifier(
    n_estimators=200,
    random_state=0,
)


In [62]:
# Train the forest on the training split. fit() returns the estimator itself,
# so `model` and `randomclassifier` refer to the same fitted object.
randomclassifier.fit(X_train, y_train)
model = randomclassifier

In [63]:
# Predict labels for the held-out validation split.
y_pred = randomclassifier.predict(X_test)

# Performance metrics

In [64]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
# Score the validation predictions: overall accuracy, F1 for the positive
# class, and the confusion matrix (rows = true label, columns = predicted).
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# One value per line, in the order accuracy, F1, confusion matrix.
print(accuracy, f1, cm, sep='\n')

0.7806959947472094
0.7001795332136446
[[799  75]
 [259 390]]


In [65]:
import joblib
# Persist the fitted classifier and the fitted TF-IDF vectorizer. Both are
# needed at inference time: the vectorizer defines the feature columns the
# classifier was trained on.
joblib.dump(model,'randomforestclassifier.pkl')
joblib.dump(cv,'tfidf.pkl')

['tfidf.pkl']

In [66]:
import joblib
# Reload the saved classifier and vectorizer from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts that were produced by a trusted source.
loaded_model = joblib.load('randomforestclassifier.pkl')
loaded_vectorizer = joblib.load('tfidf.pkl')

In [67]:
# Load the unlabelled test tweets and keep only the columns the model uses.
test_data = (
    pd.read_csv('data/test.csv')
    .drop(columns=['keyword', 'location'])
)
print(test_data)


         id                                               text
0         0                 Just happened a terrible car crash
1         2  Heard about #earthquake is different cities, s...
2         3  there is a forest fire at spot pond, geese are...
3         9           Apocalypse lighting. #Spokane #wildfires
4        11      Typhoon Soudelor kills 28 in China and Taiwan
...     ...                                                ...
3258  10861  EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259  10865  Storm in RI worse than last hurricane. My city...
3260  10868  Green Line derailment in Chicago http://t.co/U...
3261  10874  MEG issues Hazardous Weather Outlook (HWO) htt...
3262  10875  #CityofCalgary has activated its Municipal Eme...

[3263 rows x 2 columns]


In [69]:
# Apply to the test tweets the same cleaning that was used for the training
# corpus, so the saved vectorizer sees text in the form it was fitted on.

# Build the stop-word set once. Rebuilding it inside the comprehension meant
# re-reading the NLTK word list and constructing a new set for every token.
english_stopwords = set(stopwords.words('english'))

corpus_lemmatization_test = []

# Iterate over the column values directly so the loop does not depend on the
# frame having a default 0..n-1 index.
for tweet in test_data['text']:
    # Replace every character that is not an ASCII letter with a space,
    # lowercase the result and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', tweet).lower().split()

    # Drop stop words, lemmatize what remains, and store one cleaned string
    # per tweet.
    lemmas = [wordnet.lemmatize(word) for word in tokens if word not in english_stopwords]
    corpus_lemmatization_test.append(' '.join(lemmas))

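# Important note
- Fit the vectorizer only on the training data (`fit()` / `fit_transform()`). For the test data call `transform()` alone, so the test features are built from the vocabulary and IDF weights learned during training and have the same columns the model was trained on.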

In [70]:
# Vectorize the test corpus with the vectorizer that was fitted on the
# training corpus. Only transform() is called here: re-fitting would produce
# a different vocabulary and therefore different feature columns.
X_test_data = (
    loaded_vectorizer
    .transform(corpus_lemmatization_test)
    .toarray()
)

print(X_test_data[0])

[0. 0. 0. ... 0. 0. 0.]


In [71]:
# Sanity check: (number of test tweets, number of TF-IDF features).
X_test_data.shape

(3263, 20679)

In [72]:
# Sanity check: the feature count (second value) must equal that of
# X_test_data, otherwise the model cannot predict on the test matrix.
X_train.shape

(6090, 20679)

# Predicting for Test data

In [73]:
# Predict a label for every test tweet with the reloaded classifier.
target=loaded_model.predict(X_test_data)

In [74]:
# Quick look at the predicted labels.
print(target)

[1 1 1 ... 1 0 0]


In [75]:
# Sanity check: there should be one prediction per row of test_data.
len(target)

3263

In [76]:
# Attach the predictions to the test frame as a new `target` column
# (modifies test_data in place).
test_data['target']=target

# Add the predicted target values to the test data and save the result as a CSV file

In [82]:
# Write the test tweets together with their predicted labels.
# index=False keeps the DataFrame's positional index out of the file; by
# default it is written as an extra unnamed first column, which comes back
# as "Unnamed: 0" when the CSV is read again.
test_data.to_csv('prediction_test_data.csv', index=False)