In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder


In [7]:
# Load the tab-separated dataset and keep only rows that have both a text
# sample and a pattern category.
data = pd.read_csv('dataset.tsv', sep='\t').dropna(subset=['text', 'Pattern Category'])

# Features (x), the 'label' column (y) and the pattern category (z).
x = data['text']
y = data['label']
z = data['Pattern Category']

# Learn the category -> integer mapping and encode the categories in one step.
label_encoder = LabelEncoder()
z_encoded = label_encoder.fit_transform(z)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test, z_train, z_test = train_test_split(
    x,
    y,
    z_encoded,
    test_size=0.2,
    random_state=42,
)



In [8]:
# TF-IDF features feeding a random forest, trained to predict the encoded
# pattern category (z_train) from the raw text.
modelRandomForest = make_pipeline(
    TfidfVectorizer(),
    RandomForestClassifier(
        n_estimators=50,
        criterion="entropy",
        max_features=None,
        random_state=42,
    ),
).fit(X_train, z_train)

# Evaluate on the held-out split.
predictions = modelRandomForest.predict(X_test)
accuracy = accuracy_score(z_test, predictions)
print("Accuracy du modèle:", accuracy)

Accuracy du modèle: 0.902542372881356


In [12]:
# Sanity check: classify a few hand-written product-page snippets and print
# the predicted pattern category for each one.
sentences = ["Or fastest delivery Wednesday, April 17. Order within 8 hrs 30 mins",
             "Limited time deal",
             "Bundle List Price:	$299.98 Details",
             "Deal Price:	$199.98",
             "You Save:	$100.00 (33%)",
             "100% Polyester",
             "REGULAR FIT: Comfortable, easy fit through the shoulders, chest, and waist",
             "Standard $299.00/mo"]

# The pipeline was fit on integer-encoded categories, so its predictions are
# integer codes, one per sentence.
category_vectorized = modelRandomForest.predict(sentences)

# Decode the predictions themselves. The previous code passed `z` (the
# training label column) here, which does not decode these predictions and
# only appeared to work when `z` had been rebound by out-of-order execution.
category_decoded = label_encoder.inverse_transform(category_vectorized)
print(category_decoded)


['Not Dark Pattern' 'Urgency' 'Not Dark Pattern' 'Not Dark Pattern'
 'Misdirection' 'Not Dark Pattern' 'Not Dark Pattern' 'Urgency']
