In [1]:
import numpy as np
import pandas as pd 

import matplotlib as mpl 
import matplotlib.cm as cm 
import matplotlib.pyplot as plt

import plotly.graph_objects as go
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import _stop_words
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer 

import string
import re

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, mean_squared_error, log_loss

import sklearn.metrics as metrics

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import classification_report
from sklearn import metrics


from time import time

import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('C:/Users/USHNISH PAL/Documents/Code/Project/Traffic_detection_nlp/Data/TrainingSet_2_Class.csv', encoding = 'ISO-8859-1')

In [3]:
data["label"].value_counts()

label
1    25550
0    25550
Name: count, dtype: int64

In [4]:
data.shape

(51100, 2)

In [5]:
data.head()

Unnamed: 0,label,text
0,1,Disabled Vehicle on Westbound highway WB at Em...
1,0,New Teacher Lunch &amp; training! Marker wars ...
2,0,And the spot in our #uhaultrends Canadian Des...
3,0,"years ago today #MLK gave his historic ""I Hav..."
4,0,Aww itâs always hard to say goodbye! ð¢ W...


### Text Cleaning

In [6]:
import spacy
nlp = spacy.load('en_core_web_lg')

stopwords = _stop_words.ENGLISH_STOP_WORDS
lemmatizer = WordNetLemmatizer()

def clean(doc):
    text_no_namedentities = []
    document = nlp(doc)
    ents = [e.text for e in document.ents]
    for item in document:
        if item.text in ents:
            pass
        else:
            text_no_namedentities.append(item.text)
    doc = (" ".join(text_no_namedentities))

    doc = doc.lower().strip()
    doc = doc.replace("</br>", " ") 
    doc = doc.replace("-", " ") 
    doc = "".join([char for char in doc if char not in string.punctuation and not char.isdigit()])
    doc = " ".join([token for token in doc.split() if token not in stopwords])    
    doc = "".join([lemmatizer.lemmatize(word) for word in doc])
    return doc

In [7]:
data['text'] = data['text'].apply(clean)
data.head()

Unnamed: 0,label,text
0,1,disabled vehicle westbound highway emily drive...
1,0,new teacher lunch amp training marker wars w s...
2,0,spot uhaultrends canadian destination cites co...
3,0,years ago today mlk gave historic dream speech
4,0,aww itâs hard say goodbye whatâs favorite ...


In [8]:
docs = list(data['text'])
tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_features = 20000) 
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(docs)
docs = tfidf_vectorizer_vectors.toarray()

In [9]:
X = docs 
y = data['label']
print(X.shape, y.shape)

(51100, 20000) (51100,)


In [10]:
SEED=123
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(40880, 20000) (40880,)
(10220, 20000) (10220,)


In [11]:
mnb = MultinomialNB() 
%time mnb.fit(X_train, y_train)

y_pred_test = mnb.predict(X_test)
y_pred_test_prob = mnb.predict_proba(X_test)

print("\nNative Bayes Accuracy :",accuracy_score(y_test, y_pred_test))
print("Native Bayes MSE :", mean_squared_error(y_test, y_pred_test))
print("Native Bayes Loss :", log_loss(y_test, y_pred_test_prob))

CPU times: total: 29.8 s
Wall time: 33.3 s

Native Bayes Accuracy : 0.9763209393346379
Native Bayes MSE : 0.023679060665362035
Native Bayes Loss : 0.07020694142713657


In [12]:
lr = LogisticRegression(random_state=SEED)
%time lr.fit(X_train, y_train)

y_pred_test = lr.predict(X_test)
y_pred_test_prob = lr.predict_proba(X_test)

print("\nLogistic Regression Accuracy :",accuracy_score(y_test, y_pred_test))
print("Logistic Regression MSE :", mean_squared_error(y_test, y_pred_test))
print("Logistic Regression Loss :", log_loss(y_test, y_pred_test_prob))

CPU times: total: 2min 29s
Wall time: 1min 30s

Logistic Regression Accuracy : 0.9822896281800392
Logistic Regression MSE : 0.017710371819960862
Logistic Regression Loss : 0.06387740374399828


In [13]:
svm =  LinearSVC(class_weight='balanced') 
%time svm.fit(X_train, y_train)

y_pred_test = svm.predict(X_test)
# y_pred_test_prob = svm.predict_proba(X_test)

print("\nSVM Accuracy :",accuracy_score(y_test, y_pred_test))
print("SVM MSE :", mean_squared_error(y_test, y_pred_test))
# print("SVM Loss :", log_loss(y_test, y_pred_test_prob))

CPU times: total: 5.7 s
Wall time: 27.8 s

SVM Accuracy : 0.9843444227005871
SVM MSE : 0.015655577299412915


In [15]:
rf = RandomForestClassifier(n_estimators=100, random_state=SEED)
%time rf.fit(X_train, y_train)

y_pred_test = rf.predict(X_test)
y_pred_test_prob = rf.predict_proba(X_test)

print("\nRandom Forest Accuracy :",accuracy_score(y_test, y_pred_test))
print("Random Forest MSE :", mean_squared_error(y_test, y_pred_test))
print("Random Forest Loss :", log_loss(y_test, y_pred_test_prob))

CPU times: total: 9min 40s
Wall time: 10min 37s

Random Forest Accuracy : 0.975440313111546
Random Forest MSE : 0.024559686888454012
Random Forest Loss : 0.08194441519391267
