In [2]:
import pandas as pd 
from sklearn.feature_extraction import _stop_words
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer 
import string
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, log_loss
from sklearn.model_selection import train_test_split

In [7]:
data = pd.read_csv('C:/Users/USHNISH PAL/Documents/Code/Project/Traffic_detection_nlp/Data/TrainingSet_3_Class.csv', encoding = 'ISO-8859-1')

In [8]:
data["label"].value_counts()

label
0    25600
1    17479
2     8099
Name: count, dtype: int64

In [9]:
data.shape

(51178, 2)

In [11]:
data.head()

Unnamed: 0,label,text
0,2,UPDATE: Rt. in York County has reopened near C...
1,0,#Fargo #ND #USA - Sr. Clinical Research Associ...
2,0,Just two of the biggest fans riding the ride i...
3,1,Wreck on Southbound just north of Atlanta High...
4,0,Justin bieber's new album is going to be amazi...


In [13]:
import spacy
nlp = spacy.load('en_core_web_lg')

stopwords = _stop_words.ENGLISH_STOP_WORDS
lemmatizer = WordNetLemmatizer()

def clean(doc):
    text_no_namedentities = []
    document = nlp(doc)
    ents = [e.text for e in document.ents]
    for item in document:
        if item.text in ents:
            pass
        else:
            text_no_namedentities.append(item.text)
    doc = (" ".join(text_no_namedentities))

    doc = doc.lower().strip()
    doc = doc.replace("</br>", " ") 
    doc = doc.replace("-", " ") 
    doc = "".join([char for char in doc if char not in string.punctuation and not char.isdigit()])
    doc = " ".join([token for token in doc.split() if token not in stopwords])    
    doc = "".join([lemmatizer.lemmatize(word) for word in doc])
    return doc

In [14]:
data['text'] = data['text'].apply(clean)
data.head()

Unnamed: 0,label,text
0,2,update rt york county reopened near commerce c...
1,0,fargo nd usa sr clinical research associate sr...
2,0,just biggest fans riding ride hi ðð »
3,1,wreck just north atlanta highway exit causing ...
4,0,justin bieber s new album going amazing ð


In [15]:
docs = list(data['text'])
tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_features = 20000) 
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(docs)
docs = tfidf_vectorizer_vectors.toarray()

In [16]:
X = docs 
y = data['label']
print(X.shape, y.shape)

(51178, 20000) (51178,)


In [17]:
SEED=123
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(40942, 20000) (40942,)
(10236, 20000) (10236,)


In [18]:
mnb = MultinomialNB() 
%time mnb.fit(X_train, y_train)

y_pred_test = mnb.predict(X_test)
y_pred_test_prob = mnb.predict_proba(X_test)

print("\nNative Bayes Accuracy :",accuracy_score(y_test, y_pred_test))
print("Native Bayes MSE :", mean_squared_error(y_test, y_pred_test))
print("Native Bayes Loss :", log_loss(y_test, y_pred_test_prob))

CPU times: total: 31.3 s
Wall time: 22.2 s

Native Bayes Accuracy : 0.8772958186791715
Native Bayes MSE : 0.13735834310277453
Native Bayes Loss : 0.27309121937971476


In [19]:
lr = LogisticRegression(random_state=SEED)
%time lr.fit(X_train, y_train)

y_pred_test = lr.predict(X_test)
y_pred_test_prob = lr.predict_proba(X_test)

print("\nLogistic Regression Accuracy :",accuracy_score(y_test, y_pred_test))
print("Logistic Regression MSE :", mean_squared_error(y_test, y_pred_test))
print("Logistic Regression Loss :", log_loss(y_test, y_pred_test_prob))

CPU times: total: 41min 32s
Wall time: 9min 9s

Logistic Regression Accuracy : 0.9706916764361079
Logistic Regression MSE : 0.045427901524032824
Logistic Regression Loss : 0.10612861128457544


In [22]:
svm =  LinearSVC(class_weight='balanced') 
%time svm.fit(X_train, y_train)

y_pred_test = svm.predict(X_test)
# y_pred_test_prob = svm.predict_proba(X_test)

print("\nSVM Accuracy :",accuracy_score(y_test, y_pred_test))
print("SVM MSE :", mean_squared_error(y_test, y_pred_test))
# print("SVM Loss :", log_loss(y_test, y_pred_test_prob))

CPU times: total: 9.36 s
Wall time: 27.7 s

SVM Accuracy : 0.9720593982024228
SVM MSE : 0.042887846815162174


In [21]:
rf = RandomForestClassifier(n_estimators=100, random_state=SEED)
%time rf.fit(X_train, y_train)

y_pred_test = rf.predict(X_test)
y_pred_test_prob = rf.predict_proba(X_test)

print("\nRandom Forest Accuracy :",accuracy_score(y_test, y_pred_test))
print("Random Forest MSE :", mean_squared_error(y_test, y_pred_test))
print("Random Forest Loss :", log_loss(y_test, y_pred_test_prob))

CPU times: total: 11min 27s
Wall time: 12min 46s

Random Forest Accuracy : 0.9586752637749121
Random Forest MSE : 0.05861664712778429
Random Forest Loss : 0.18665190858754624
