In [None]:
import pandas as pd

In [None]:
df=pd.read_csv('/content/dataset.tsv',sep='\t')

In [None]:
df.head(5)

Unnamed: 0,page_id,text,label,Pattern Category
0,1012,FLASH SALE | LIMITED TIME ONLY Shop Now,1,Urgency
1,158,Pillowcases & Shams,0,Not Dark Pattern
2,108,Write a review,0,Not Dark Pattern
3,1425,"To start your return, simply click on the foll...",0,Not Dark Pattern
4,1658,newsletter signup (privacy policy),0,Not Dark Pattern


In [None]:
df.shape

(2356, 4)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(
    df.text,
    df.label,
    test_size=0.2,
    random_state=2022,
    stratify=df.label
)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.naive_bayes import BernoulliNB


In [None]:
param_grid = {
    'tfidfvectorizer__binary': [True, False],  # Whether to use binary or term-frequency features
    'bernoullinb__alpha': np.logspace(-4,5,1200)  # Range of alpha values to explore
}

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

In [None]:
model = make_pipeline(TfidfVectorizer(), BernoulliNB())

In [None]:
# multinomial naive bayes using TF-IDF without preprocessing and augmentation
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf=Pipeline([
    ('vectorizer_bow',TfidfVectorizer()),
    ('Multi NB',MultinomialNB())

])
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.85      0.90       236
           1       0.87      0.97      0.91       236

    accuracy                           0.91       472
   macro avg       0.91      0.91      0.91       472
weighted avg       0.91      0.91      0.91       472



In [None]:
# bernoulli naive bayes using TF-IDF without preprocessing and augmentation
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report

clf=Pipeline([
    ('vectorizer_bow',TfidfVectorizer()),
    ('bernoulli NB',BernoulliNB(alpha=0.122527063642198))

])
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       236
           1       0.96      0.96      0.96       236

    accuracy                           0.96       472
   macro avg       0.96      0.96      0.96       472
weighted avg       0.96      0.96      0.96       472



In [None]:
import pickle
with open('Bern096.pickle','wb') as f:
  pickle.dump(clf,f)

In [None]:
with open('Bern096.pickle','rb') as f:
  model=pickle.load(f)

In [None]:
model.predict(["hello my name is virat kohli"])[0]

0

In [None]:
model.predict(["hurry up only few items left"])[0]

1

In [None]:
random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=10, cv=5, random_state=42)
random_search.fit(X_train, y_train)
print("Best Hyperparameters:", random_search.best_params_)

Best Hyperparameters: {'tfidfvectorizer__binary': True, 'bernoullinb__alpha': 1.0}


In [None]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(model, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Best Hyperparameters:", grid_search.best_params_)


Best Hyperparameters: {'bernoullinb__alpha': 0.7340428868213008, 'tfidfvectorizer__binary': True}


In [None]:
test_accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.9533898305084746


In [None]:
test_accuracy = random_search.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.9491525423728814


In [None]:
import pickle
with open('BNBTFIDF.pickle','wb') as f:
  pickle.dump(clf,f)

In [None]:
with open('BNBTFIDF.pickle','rb') as f:
  model=pickle.load(f)

In [None]:
model.predict(["hello my name is virat kohli"])[0]

0

In [None]:
model.predict(["hurry up only few items left"])[0]

1

In [None]:
from sklearn.naive_bayes import GaussianNB
vectorizer =TfidfVectorizer()
X_train_count = vectorizer.fit_transform(X_train)
gnb = GaussianNB()
gnb.fit(X_train_count.toarray(), y_train)
X_test_count = vectorizer.transform(X_test)
y_pred = gnb.predict(X_test_count.toarray())
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.65      0.76       236
           1       0.73      0.94      0.82       236

    accuracy                           0.79       472
   macro avg       0.82      0.79      0.79       472
weighted avg       0.82      0.79      0.79       472



In [None]:
import spacy
nlp=spacy.load("en_core_web_sm")
def preprocess(text):
    doc=nlp(text)
    filtered_token=[]
    for token in doc:
      if token.is_stop or token.is_punct:
        continue
      filtered_token.append(token.lemma_)
    return " ".join(filtered_token)

In [None]:
df['pre_text']=df.text.apply(preprocess)

In [None]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(
    df.pre_text,
    df.label,
    test_size=0.2,
    random_state=2022,
    stratify=df.label
)


In [None]:
# multinomial naive bayes using TF-IDF without preprocessing and augmentation
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf=Pipeline([
    ('vectorizer_bow',TfidfVectorizer()),
    ('Multi NB',MultinomialNB())

])
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      0.83      0.90       236
           1       0.85      0.97      0.91       236

    accuracy                           0.90       472
   macro avg       0.91      0.90      0.90       472
weighted avg       0.91      0.90      0.90       472



In [None]:
# bernoulli naive bayes using TF-IDF without preprocessing and augmentation
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report

clf=Pipeline([
    ('vectorizer_bow',TfidfVectorizer()),
    ('bernoulli NB',BernoulliNB())

])
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.95      0.92      0.94       236
           1       0.92      0.95      0.94       236

    accuracy                           0.94       472
   macro avg       0.94      0.94      0.94       472
weighted avg       0.94      0.94      0.94       472



In [None]:
from sklearn.naive_bayes import GaussianNB
vectorizer =TfidfVectorizer()
X_train_count = vectorizer.fit_transform(X_train)
gnb = GaussianNB()
gnb.fit(X_train_count.toarray(), y_train)
X_test_count = vectorizer.transform(X_test)
y_pred = gnb.predict(X_test_count.toarray())
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.60      0.71       236
           1       0.69      0.92      0.79       236

    accuracy                           0.76       472
   macro avg       0.79      0.76      0.75       472
weighted avg       0.79      0.76      0.75       472



In [None]:
import pandas as pd
df=pd.read_csv('/content/aug_data.csv')

In [None]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,page_id,text,label,Pattern Category
0,0,1012.0,FLASH SALE | LIMITED TIME ONLY Shop Now,1,Urgency
1,1,158.0,Pillowcases & Shams,0,Not Dark Pattern
2,2,108.0,Write a review,0,Not Dark Pattern
3,3,1425.0,"To start your return, simply click on the foll...",0,Not Dark Pattern
4,4,1658.0,newsletter signup (privacy policy),0,Not Dark Pattern


In [None]:
df.shape

(4712, 5)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(
    df.text,
    df.label,
    test_size=0.2,
    random_state=2022,
    stratify=df.label
)

In [None]:
import pandas as pd

# Assuming your data is stored in a DataFrame
df = pd.concat([X_train, y_train], axis=1)
df = df.dropna()

X_train = df.drop('label', axis=1)
y_train = df['label']


In [None]:
df.shape

(3768, 2)

In [None]:

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(
    df.text,
    df.label,
    test_size=0.2,
    random_state=2022,
    stratify=df.label
)

In [None]:
# multinomial naive bayes using TF-IDF without preprocessing and augmentation
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf=Pipeline([
    ('vectorizer_bow',TfidfVectorizer()),
    ('Multi NB',MultinomialNB())

])
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      0.88      0.92       377
           1       0.89      0.97      0.93       377

    accuracy                           0.93       754
   macro avg       0.93      0.93      0.93       754
weighted avg       0.93      0.93      0.93       754



In [None]:
# bernoulli naive bayes using TF-IDF without preprocessing and augmentation
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report

clf=Pipeline([
    ('vectorizer_bow',TfidfVectorizer()),
    ('bernoulli NB',BernoulliNB())

])
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.94      0.97      0.95       377
           1       0.96      0.94      0.95       377

    accuracy                           0.95       754
   macro avg       0.95      0.95      0.95       754
weighted avg       0.95      0.95      0.95       754



In [None]:
from sklearn.naive_bayes import GaussianNB
vectorizer =TfidfVectorizer()
X_train_count = vectorizer.fit_transform(X_train)
gnb = GaussianNB()
gnb.fit(X_train_count.toarray(), y_train)
X_test_count = vectorizer.transform(X_test)
y_pred = gnb.predict(X_test_count.toarray())
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.64      0.75       377
           1       0.72      0.92      0.81       377

    accuracy                           0.78       754
   macro avg       0.80      0.78      0.78       754
weighted avg       0.80      0.78      0.78       754



In [None]:
import spacy
nlp=spacy.load("en_core_web_sm")
def preprocess(text):
    doc=nlp(text)
    filtered_token=[]
    for token in doc:
      if token.is_stop or token.is_punct:
        continue
      filtered_token.append(token.lemma_)
    return " ".join(filtered_token)

In [None]:
df['pre_text']=df.text.apply(preprocess)

In [None]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(
    df.pre_text,
    df.label,
    test_size=0.2,
    random_state=2022,
    stratify=df.label
)


In [None]:
# multinomial naive bayes using TF-IDF with preprocessing and with augmentation
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf=Pipeline([
    ('vectorizer_bow',TfidfVectorizer()),
    ('Multi NB',MultinomialNB())

])
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.94      0.88      0.91       377
           1       0.89      0.94      0.91       377

    accuracy                           0.91       754
   macro avg       0.91      0.91      0.91       754
weighted avg       0.91      0.91      0.91       754



In [None]:
# bernoulli naive bayes using TF-IDF with preprocessing and with augmentation
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report

clf=Pipeline([
    ('vectorizer_bow',TfidfVectorizer()),
    ('bernoulli NB',BernoulliNB())

])
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92       377
           1       0.93      0.91      0.92       377

    accuracy                           0.92       754
   macro avg       0.92      0.92      0.92       754
weighted avg       0.92      0.92      0.92       754



In [None]:
from sklearn.naive_bayes import GaussianNB
vectorizer =TfidfVectorizer()
X_train_count = vectorizer.fit_transform(X_train)
gnb = GaussianNB()
gnb.fit(X_train_count.toarray(), y_train)
X_test_count = vectorizer.transform(X_test)
y_pred = gnb.predict(X_test_count.toarray())
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.59      0.70       377
           1       0.69      0.90      0.78       377

    accuracy                           0.74       754
   macro avg       0.77      0.74      0.74       754
weighted avg       0.77      0.74      0.74       754

