In [1]:
import pandas as pd 

In [2]:
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

In [3]:
# Load the two CSV files from the current working directory into DataFrames.
# Per the previews further down, train has a `target` column and test does not.
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

# Feature Engineering

In [4]:
import string
from nltk.corpus import stopwords 

In [5]:
def text_process(mess):
    """Strip punctuation from a message and split it into tokens.

    Every character found in ``string.punctuation`` is dropped, the remaining
    characters are glued back together, and the result is split on whitespace.
    Letter case is left untouched and no stop words are removed.

    Parameters
    ----------
    mess : str
        The raw text to tokenize.

    Returns
    -------
    list of str
        The whitespace-separated tokens of the punctuation-free text.
    """
    kept_chars = []
    for symbol in mess:
        if symbol in string.punctuation:
            continue
        kept_chars.append(symbol)

    return ''.join(kept_chars).split()

In [6]:
# Sanity check: tokenize the first five training tweets with text_process.
train['text'].head(5).apply(text_process)

0    [Our, Deeds, are, the, Reason, of, this, earth...
1        [Forest, fire, near, La, Ronge, Sask, Canada]
2    [All, residents, asked, to, shelter, in, place...
3    [13000, people, receive, wildfires, evacuation...
4    [Just, got, sent, this, photo, from, Ruby, Ala...
Name: text, dtype: object

In [7]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
# Preview the first rows of the test data.
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# Learn the bag-of-words vocabulary from the training tweets. text_process is
# passed as the analyzer, so each tweet is tokenized by that function:
# punctuation stripped, split on whitespace, and text_process itself does no
# lowercasing or stop-word removal.
bow_transformer = CountVectorizer(analyzer=text_process).fit(train['text'])

In [11]:
# Turn every training tweet into a row of token counts over the fitted vocabulary.
twitt_bow = bow_transformer.transform(train['text'])

In [12]:
# Shape of the count matrix: (number of tweets, vocabulary size).
twitt_bow.shape

(7613, 26817)

# Creating the Model - MultinomialNB

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer

In [14]:
# Fit the TF-IDF weighting on the full training count matrix.
# NOTE(review): this (and the vocabulary fit above) happens before the
# train_test_split call further down, so the held-out rows also contribute to
# the vocabulary and IDF statistics. That is a mild form of leakage and may make
# the hold-out scores slightly optimistic.
tf_transformer = TfidfTransformer().fit(twitt_bow)

In [15]:
# Re-weight the raw token counts into TF-IDF features.
twitt_tfidf = tf_transformer.transform(twitt_bow)

In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [17]:
# Features are the TF-IDF matrix; labels are the `target` column of train.
X, y = twitt_tfidf, train['target']

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=200,
)

In [18]:
# Train a multinomial Naive Bayes classifier (default settings) on the
# training split.
twitter_harm_detector = MultinomialNB() 
twitter_harm_detector.fit(X_train, y_train)

MultinomialNB()

In [19]:
# Predict labels for the held-out split with the Naive Bayes model.
pred = twitter_harm_detector.predict(X_test)

In [20]:
from sklearn.metrics import classification_report,confusion_matrix
# Per-class precision/recall/F1 for the Naive Bayes predictions, followed by the
# confusion matrix (true labels first, predicted labels second).
print (classification_report(y_test,pred))
print (confusion_matrix(y_test,pred))

              precision    recall  f1-score   support

           0       0.75      0.93      0.83      1303
           1       0.86      0.59      0.70       981

    accuracy                           0.78      2284
   macro avg       0.81      0.76      0.76      2284
weighted avg       0.80      0.78      0.77      2284

[[1210   93]
 [ 403  578]]


# Creating the Model - Support Vector Classifier 

In [21]:
from sklearn.svm import SVC

In [22]:
# Support vector classifier created with default settings (no arguments passed).
svc_model = SVC()

In [23]:
# Same features, labels, test_size and random_state as the split used for the
# MultinomialNB model, so both models are trained and scored on identical rows.
X, y = twitt_tfidf, train['target']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=200,
)

In [24]:
# Train the support vector classifier on the training split.
svc_model.fit(X_train,y_train)

SVC()

In [25]:
# Shape of the full TF-IDF feature matrix: (rows, feature columns).
X.shape

(7613, 26817)

In [26]:
# Predict labels for the held-out split with the SVC model.
predict = svc_model.predict(X_test)

In [27]:
# Per-class precision/recall/F1 for the SVC predictions, followed by the
# confusion matrix (true labels first, predicted labels second).
print (classification_report(y_test,predict))
print (confusion_matrix(y_test,predict))

              precision    recall  f1-score   support

           0       0.75      0.93      0.83      1303
           1       0.87      0.60      0.71       981

    accuracy                           0.79      2284
   macro avg       0.81      0.76      0.77      2284
weighted avg       0.80      0.79      0.78      2284

[[1212   91]
 [ 395  586]]


After reviewing the two models, I decided to use the SVC model, as it performed slightly better than MultinomialNB. In particular, it identified more true positive cases (586 vs. 578) and produced fewer false positives (91 vs. 93).

In [28]:
# Read the test set again.
# NOTE(review): test was already loaded from the same file in an earlier cell
# and is not modified in any cell visible here, so this re-read looks redundant.
test = pd.read_csv('test.csv')

In [29]:
# Vectorize the test tweets with the vocabulary learned from the training data.
test_data_CV = bow_transformer.transform(test['text'])
# Reuse the TF-IDF transformer that was fitted on the training count matrix.
# Fitting a fresh TfidfTransformer on the test matrix would derive IDF weights
# from the test tweets, so the features passed to the SVC would be weighted
# differently from the ones it was trained on.
# tf_test is kept as an alias so any code that refers to that name still works.
tf_test = tf_transformer
test_tfidf = tf_test.transform(test_data_CV)

In [30]:
# Predict labels for the test tweets with the trained SVC.
# NOTE(review): pred_test is not used or written out in the cells visible here.
pred_test = svc_model.predict(test_tfidf)

In [31]:
import pickle

filename = 'twitt_SVC_detector_model.sav'
# Write the trained classifier to disk. The context manager guarantees the file
# handle is flushed and closed even if pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(svc_model, model_file)
# NOTE(review): only the classifier is persisted. Predicting on new text also
# requires bow_transformer and tf_transformer, which produced the features the
# model was trained on; consider saving them alongside the model.