In [None]:
from google.colab import drive
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import random
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive inside the Colab runtime, then read the dataset CSV stored there.
drive.mount('/content/drive')
path = '/content/drive/MyDrive/Dataset Research Method/clickbait.csv'
df = pd.read_csv(filepath_or_buffer=path)

Mounted at /content/drive


In [None]:
# Download the NLTK "stopwords" corpus and keep its English word list in `stop`.
nltk.download('stopwords')
stop = stopwords.words("english")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Remove English stopwords from every title: split on whitespace, drop tokens found in the
# stopword list, and re-join the survivors with single spaces.
# A set gives constant-time membership tests instead of scanning the list once per token.
# NOTE(review): the comparison is case-sensitive, so a capitalised token such as "The" is only
# removed if `stop` contains that exact casing — confirm this is the intended behaviour.
stop_set = set(stop)
df['title_without_stopwords'] = df['title'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_set)
)

In [None]:
# Fit the encoder on the raw labels, then store the resulting integer codes in a new column.
le = LabelEncoder().fit(df['label'])
df['label_encoded'] = le.transform(df['label'])

In [None]:
# Model inputs: the stopword-filtered titles (text) and the integer-encoded labels (target).
title = df['title_without_stopwords']
y = df['label_encoded']

In [None]:
# Bag-of-words features built from the stopword-filtered titles.
# NOTE(review): the vectorizer is fitted on every title before the train/val/test split,
# so its vocabulary is learned from the evaluation rows too — consider fitting on train only.
bow = CountVectorizer()
x_bow = bow.fit_transform(title)
# A bare `x_bow.shape` in the middle of a cell is never displayed (only the last expression
# is), so print it explicitly.
print(x_bow.shape)
# Preview a handful of rows only: densifying the whole sparse matrix allocates
# rows x vocabulary cells just to show a truncated repr.
x_bow[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# TF-IDF features built from the same stopword-filtered titles.
# NOTE(review): as with the BoW vectorizer, this is fitted on all titles before the
# train/val/test split, so document frequencies include the evaluation rows.
tfidf = TfidfVectorizer()
x_tfidf = tfidf.fit_transform(title)
# Preview a handful of rows only: densifying the whole sparse matrix allocates
# rows x vocabulary cells just to show a truncated repr.
x_tfidf[:5].toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# 80/10/10 split: first hold out 20% of the rows, then halve that hold-out into test and
# validation. Both feature sets use the same seed and the same target series.

# Bag-of-words features
x_bow_train, x_bow_hold, y_bow_train, y_bow_hold = train_test_split(
    x_bow, y, test_size=0.2, random_state=42
)
x_bow_test, x_bow_val, y_bow_test, y_bow_val = train_test_split(
    x_bow_hold, y_bow_hold, test_size=0.5, random_state=42
)

# TF-IDF features
x_tfidf_train, x_tfidf_hold, y_tfidf_train, y_tfidf_hold = train_test_split(
    x_tfidf, y, test_size=0.2, random_state=42
)
x_tfidf_test, x_tfidf_val, y_tfidf_test, y_tfidf_val = train_test_split(
    x_tfidf_hold, y_tfidf_hold, test_size=0.5, random_state=42
)

In [None]:
# Bernoulli naive Bayes classifier. This single instance is fitted on the BoW features and
# later re-fitted on the TF-IDF features, so it only ever holds the most recent fit.
# NOTE(review): scikit-learn's BernoulliNB binarizes its inputs by default, so BoW counts and
# TF-IDF weights may collapse to the same presence/absence features — verify the two runs
# actually differ.
clf = BernoulliNB()

In [None]:

# Fit Bernoulli NB on the BoW training split, then score it on validation and test.
clf.fit(x_bow_train, y_bow_train)

# --- validation split ---
y_bow_val_pred = clf.predict(x_bow_val)
accuracy_bow_val, precision_bow_val, recall_bow_val, f1_bow_val = [
    score(y_bow_val, y_bow_val_pred)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
]
print("BoW Val")
for caption, value in (
    ("Accuracy:", accuracy_bow_val),
    ("Precision:", precision_bow_val),
    ("Recall:", recall_bow_val),
    ("F1 score:", f1_bow_val),
):
    print(caption, value)

# --- test split ---
y_bow_test_pred = clf.predict(x_bow_test)
accuracy_bow_test, precision_bow_test, recall_bow_test, f1_bow_test = [
    score(y_bow_test, y_bow_test_pred)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
]
print("BoW Test")
for caption, value in (
    ("Accuracy:", accuracy_bow_test),
    ("Precision:", precision_bow_test),
    ("Recall:", recall_bow_test),
    ("F1 score:", f1_bow_test),
):
    print(caption, value)

BoW Val
Accuracy: 0.9818693341669271
Precision: 0.9749235474006116
Recall: 0.9894475481067659
F1 score: 0.982131854590265
BoW Test
Accuracy: 0.9812441387933729
Precision: 0.9712713936430318
Recall: 0.9918851435705368
F1 score: 0.9814700432365657


In [None]:
# Confusion matrix of the BoW test predictions, drawn as a grayscale heat map and also
# printed as raw counts.
cm_bow = confusion_matrix(y_bow_test, y_bow_test_pred)
plt.imshow(cm_bow, cmap=plt.cm.gray)
plt.title("Confusion Matrix for BoW Bernoulli")
plt.colorbar()
# confusion_matrix puts true labels on rows and predictions on columns; imshow draws columns
# along x and rows along y. The original called xlabel twice, which overwrote the first label
# and left the y axis unlabeled.
plt.xlabel("Prediction Data")
plt.ylabel("Test Data")
print(cm_bow)

In [None]:
# Per-class precision/recall/F1 for the BoW test predictions, shown with five decimals.
target = ('0', '1')
report_bow = classification_report(
    y_bow_test,
    y_bow_test_pred,
    target_names=target,
    digits=5,
)
print(report_bow)

In [None]:

# Re-fit the same Bernoulli NB instance on the TF-IDF training split (this replaces the
# earlier BoW fit held in `clf`), then score it on validation and test.
clf.fit(x_tfidf_train, y_tfidf_train)

# --- validation split ---
y_tfidf_val_pred = clf.predict(x_tfidf_val)
accuracy_tfidf_val = accuracy_score(y_tfidf_val, y_tfidf_val_pred)
precision_tfidf_val = precision_score(y_tfidf_val, y_tfidf_val_pred)
recall_tfidf_val = recall_score(y_tfidf_val, y_tfidf_val_pred)
f1_tfidf_val = f1_score(y_tfidf_val, y_tfidf_val_pred)
print("TFIDF Val")
print("Accuracy:", accuracy_tfidf_val)
print("Precision:", precision_tfidf_val)
print("Recall:", recall_tfidf_val)
print("F1 score:", f1_tfidf_val)

# --- test split ---
y_tfidf_test_pred = clf.predict(x_tfidf_test)
accuracy_tfidf_test = accuracy_score(y_tfidf_test, y_tfidf_test_pred)
precision_tfidf_test = precision_score(y_tfidf_test, y_tfidf_test_pred)
recall_tfidf_test = recall_score(y_tfidf_test, y_tfidf_test_pred)
f1_tfidf_test = f1_score(y_tfidf_test, y_tfidf_test_pred)
# These are test-split metrics; the header previously read "TFIDF Val" (copy-paste slip).
print("TFIDF Test")
print("Accuracy:", accuracy_tfidf_test)
print("Precision:", precision_tfidf_test)
print("Recall:", recall_tfidf_test)
print("F1 score:", f1_tfidf_test)

In [None]:
# Confusion matrix of the TF-IDF test predictions, drawn as a grayscale heat map and also
# printed as raw counts.
cm_tfidf = confusion_matrix(y_tfidf_test, y_tfidf_test_pred)
plt.imshow(cm_tfidf, cmap=plt.cm.gray)
plt.title("Confusion Matrix for TFIDF Bernoulli")
plt.colorbar()
# confusion_matrix puts true labels on rows and predictions on columns; imshow draws columns
# along x and rows along y. The original called xlabel twice, which overwrote the first label
# and left the y axis unlabeled.
plt.xlabel("Prediction Data")
plt.ylabel("Test Data")
print(cm_tfidf)

In [None]:
# Per-class precision/recall/F1 for the TF-IDF test predictions, shown with five decimals.
# This cell follows the TF-IDF evaluation, so it must report the TF-IDF labels/predictions;
# it previously re-used the BoW variables (copy-paste slip).
target = ('0', '1')
print(classification_report(y_tfidf_test, y_tfidf_test_pred, target_names=target, digits=5))

In [None]:
# Multinomial naive Bayes classifier, bound to its own name so the Bernoulli model in `clf`
# is not replaced.
clf2 = MultinomialNB()

In [None]:

# Fit Multinomial NB on the BoW training split, then score it on validation and test.
# The prediction and metric variables below reuse the names from the Bernoulli BoW run,
# so those earlier values are overwritten.
clf2.fit(x_bow_train, y_bow_train)

# --- validation split ---
y_bow_val_pred = clf2.predict(x_bow_val)
accuracy_bow_val, precision_bow_val, recall_bow_val, f1_bow_val = [
    score(y_bow_val, y_bow_val_pred)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
]
print("BoW Val")
for caption, value in (
    ("Accuracy:", accuracy_bow_val),
    ("Precision:", precision_bow_val),
    ("Recall:", recall_bow_val),
    ("F1 score:", f1_bow_val),
):
    print(caption, value)

# --- test split ---
y_bow_test_pred = clf2.predict(x_bow_test)
accuracy_bow_test, precision_bow_test, recall_bow_test, f1_bow_test = [
    score(y_bow_test, y_bow_test_pred)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
]
print("BoW Test")
for caption, value in (
    ("Accuracy:", accuracy_bow_test),
    ("Precision:", precision_bow_test),
    ("Recall:", recall_bow_test),
    ("F1 score:", f1_bow_test),
):
    print(caption, value)