In [1]:
# import libray
import pandas as pd 
from sklearn.feature_extraction import _stop_words
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# read data from csv file
data = pd.read_csv('/kaggle/input/dataset300/CleanedSet_2_Class.csv', encoding = 'ISO-8859-1')
data.head()

Unnamed: 0,label,text
0,1,disabled vehicle westbound highway emily drive...
1,0,new teacher lunch amp training marker wars w s...
2,0,spot uhaultrends canadian destination cites co...
3,0,years ago today mlk gave historic dream speech
4,0,aww itÃ¢ÂÂs hard say goodbye whatÃ¢ÂÂs fav...


In [3]:
# count the number of label
data["label"].value_counts()

label
1    25549
0    25549
Name: count, dtype: int64

In [None]:
# data preprocessing using spacy
import spacy
import string

nlp = spacy.load('en_core_web_lg')

stopwords = _stop_words.ENGLISH_STOP_WORDS
lemmatizer = WordNetLemmatizer()

# data cleaning function
def clean(doc):
    text_no_namedentities = []
    document = nlp(doc)
    ents = [e.text for e in document.ents]
    for item in document:
        if item.text in ents:
            pass
        else:
            text_no_namedentities.append(item.text)
    
    doc = (" ".join(text_no_namedentities))
    doc = doc.lower().strip()
    doc = doc.replace("</br>", " ") 
    doc = doc.replace("-", " ") 
    doc = "".join([char for char in doc if char not in string.punctuation and not char.isdigit()])
    doc = " ".join([token for token in doc.split() if token not in stopwords])    
    doc = "".join([lemmatizer.lemmatize(word) for word in doc])
    
    return doc

In [None]:
# cleaning the tweet data
data['text'] = data['text'].apply(clean)
data.head()

In [4]:
# converting the text data to list
docs = list(data['text'])

# converting the text data to vector or array
tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_features = 20000) 
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(docs)
docs = tfidf_vectorizer_vectors.toarray()

In [5]:
# load the data in x and y variable
X = docs 
y = data['label']
print(X.shape, y.shape)

(51098, 20000) (51098,)


In [6]:
# spliting the data in trainig set and test set
SEED=123
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(40878, 20000) (40878,)
(10220, 20000) (10220,)


In [8]:
# Native Bayes Model
mnb = MultinomialNB() 
mnb.fit(X_train, y_train)

y_pred_test = mnb.predict(X_test)
y_pred_test_prob = mnb.predict_proba(X_test)

print("\nNative Bayes Accuracy :",accuracy_score(y_test, y_pred_test))
print("\nNative Bayes Loss :", log_loss(y_test, y_pred_test_prob))
print("\nNative Bayes Classification Report :")
print(classification_report(y_test, y_pred_test))


Native Bayes Accuracy : 0.9763209393346379

Native Bayes Loss : 0.06906033821316712

Native Bayes Classification Report :
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      5110
           1       0.97      0.98      0.98      5110

    accuracy                           0.98     10220
   macro avg       0.98      0.98      0.98     10220
weighted avg       0.98      0.98      0.98     10220



In [9]:
# Logistic Regression Model
lr = LogisticRegression(random_state=SEED)
lr.fit(X_train, y_train)

y_pred_test = lr.predict(X_test)
y_pred_test_prob = lr.predict_proba(X_test)

print("\nLogistic Regression Accuracy :",accuracy_score(y_test, y_pred_test))
print("\nLogistic Regression Loss :", log_loss(y_test, y_pred_test_prob))
print("\nLogistic Regression Classification Report :")
print(classification_report(y_test, y_pred_test))


Logistic Regression Accuracy : 0.9826810176125245

Logistic Regression Loss : 0.06430433817659305

Logistic Regression Classification Report :
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      5110
           1       0.99      0.98      0.98      5110

    accuracy                           0.98     10220
   macro avg       0.98      0.98      0.98     10220
weighted avg       0.98      0.98      0.98     10220



In [11]:
# Support Vector Machine Model
svm =  LinearSVC(class_weight='balanced') 
svm.fit(X_train, y_train)

y_pred_test = svm.predict(X_test)
y_pred_test_prob = svm.decision_function (X_test)

print("\nSVM Accuracy :",accuracy_score(y_test, y_pred_test))
print("\nSVM Loss :", log_loss(y_test, y_pred_test_prob))
print("\nSVM Classification Report :")
print(classification_report(y_test, y_pred_test))


SVM Accuracy : 0.9845401174168298

SVM Loss : 0.35193090876693023

SVM Classification Report :
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      5110
           1       0.99      0.98      0.98      5110

    accuracy                           0.98     10220
   macro avg       0.98      0.98      0.98     10220
weighted avg       0.98      0.98      0.98     10220



In [12]:
# Random Forest Model
rf = RandomForestClassifier(n_estimators=100, random_state=SEED)
%time rf.fit(X_train, y_train)

y_pred_test = rf.predict(X_test)
y_pred_test_prob = rf.predict_proba(X_test)

print("\nRandom Forest Accuracy :",accuracy_score(y_test, y_pred_test))
print("\nRandom Forest Loss :", log_loss(y_test, y_pred_test_prob))
print("\nRandom Forest Classification Report :")
print(classification_report(y_test, y_pred_test))

CPU times: user 8min 55s, sys: 657 ms, total: 8min 55s
Wall time: 8min 55s

Random Forest Accuracy : 0.9755381604696673

Random Forest Loss : 0.07546175770306339

Random Forest Classification Report :
              precision    recall  f1-score   support

           0       0.99      0.96      0.98      5110
           1       0.96      0.99      0.98      5110

    accuracy                           0.98     10220
   macro avg       0.98      0.98      0.98     10220
weighted avg       0.98      0.98      0.98     10220

