In [1]:
# !pip install pandas scikit-learn transformers openpyxl tensorflow

import pandas as pd
from transformers import pipeline, TFAutoModelForSequenceClassification, AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizer, AdamW, TFTrainer, TFTrainingArguments
from transformers.convert_graph_to_onnx import convert
import tensorflow as tf

In [2]:
# Load the raw spreadsheet. Columns are addressed by position, not by header
# name: the first column holds the labels, the second the message texts.
data = pd.read_excel('data.xlsx', engine='openpyxl')

# Keep only rows where both the label and the text cell are present. An empty
# text cell is read as NaN, which TfidfVectorizer and the tokenizer used in
# later cells refuse to process.
has_label_and_text = data.iloc[:, :2].notna().all(axis=1)
data_clean = data.loc[has_label_and_text]

labels = data_clean.iloc[:, 0]  # First column: class labels
# Cast to str before lowercasing: `.str.lower()` returns NaN for non-string
# cells (e.g. purely numeric messages read from Excel) instead of text.
texts = data_clean.iloc[:, 1].astype(str).str.lower()  # Second column: lowercased texts

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
texts_train, texts_test, labels_train, labels_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)


In [None]:
# Baseline model: TF-IDF features (English stop words removed) fed into a
# multinomial Naive Bayes classifier.
vectorizer = TfidfVectorizer(stop_words='english')

# The vocabulary and IDF weights are learned from the training texts only;
# the test texts are projected onto that same vocabulary.
X_train, X_test = (
    vectorizer.fit_transform(texts_train),
    vectorizer.transform(texts_test),
)

# `fit` returns the estimator itself, so `clf` is the fitted classifier.
clf = MultinomialNB().fit(X_train, labels_train)
clf


In [None]:
# Score the Naive Bayes baseline on the held-out test features.
preds = clf.predict(X_test)

# Per-class precision / recall / F1 computed against the true test labels.
nb_report = classification_report(labels_test, preds)
print("Naive Bayes Model Performance:", nb_report, sep="\n")


In [None]:
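# For simplicity, we use the pre-trained base checkpoint without fine-tuning.
# Its classification head has not been trained for spam detection, so the
# pipeline's labels (LABEL_0 / LABEL_1) and scores are not meaningful predictions.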
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, pipeline

# Checkpoint shared by the tokenizer and the sequence-classification model.
model_name = "distilbert-base-uncased"

# Load the tokenizer and a two-label classification model from that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Wrap both in a text-classification pipeline so raw strings can be scored directly.
hf_pipeline = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Re-create the TF-IDF vectorizer and the Naive Bayes classifier from the
# training split so this cell provides `vectorizer` and `clf` for the
# prediction cell that follows.
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(texts_train)

# `fit` returns the estimator itself, so `clf` is the fitted classifier.
clf = MultinomialNB().fit(X_train, labels_train)
clf

In [7]:
# Score one example message with both models and print the results side by side.
message = "You've won a $1000 prize!"

# Naive Bayes: vectorize the single message, then take the only prediction.
message_features = vectorizer.transform([message])
clf_pred = clf.predict(message_features)[0]
# The label is interpreted by truthiness: a truthy label prints as spam.
if clf_pred:
    nb_verdict = 'Spam'
else:
    nb_verdict = 'Not Spam'

# Hugging Face pipeline: returns a list with one result dict per input text.
hf_pred = hf_pipeline(message)
hf_top = hf_pred[0]

print(f"Message: {message}")
print(f"Naive Bayes Prediction: {nb_verdict}")
print(f"Huggingface Model Prediction: {hf_top['label']} (Confidence: {hf_top['score']:.2f})")

Message: You've won a $1000 prize!
Naive Bayes Prediction: Spam
Huggingface Model Prediction: LABEL_1 (Confidence: 0.53)


In [None]:
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizer, AdamW, TFTrainer, TFTrainingArguments
from transformers.convert_graph_to_onnx import convert
import tensorflow as tf

# Tokenizer for the DistilBERT checkpoint that is fine-tuned below.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Both splits are tokenized with the same settings: truncate to at most
# 128 tokens and pad the sequences within each call to a common length.
encode_kwargs = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(list(texts_train), **encode_kwargs)
test_encodings = tokenizer(list(texts_test), **encode_kwargs)

# Pair each encoded example with its label as (features dict, label) elements.
train_slices = tf.data.Dataset.from_tensor_slices(
    (dict(train_encodings), list(labels_train))
)
test_slices = tf.data.Dataset.from_tensor_slices(
    (dict(test_encodings), list(labels_test))
)

# Training data: shuffle with a 1000-element buffer, batch by 32, and repeat
# the batched data twice. Combined with `epochs=2` below, each epoch iterates
# the training data twice (four passes in total).
train_dataset = train_slices.shuffle(1000).batch(32).repeat(2)
# Evaluation data: batched only, in its original order.
test_dataset = test_slices.batch(32)

# Two-label DistilBERT classifier initialized from the base checkpoint.
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

# The loss is computed on raw logits against integer class ids.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Fine-tune; the held-out test split doubles as the validation data.
model.fit(train_dataset, epochs=2, validation_data=test_dataset)

In [None]:
# Persist the fine-tuned model to a local directory so a later cell can reload it.
model.save(filepath="fine_tuned_model_directory")

In [None]:
# Reload the model that was saved to disk by the previous cell.
loaded_model = tf.keras.models.load_model("fine_tuned_model_directory")

# Sample message to classify with the reloaded model.
message = "hello,sir"
encoded_msg = tokenizer.encode_plus(
    message,
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="tf",
)

# Pass only the two tensors that are fed to the model.
input_data = {key: encoded_msg[key] for key in ("input_ids", "attention_mask")}

output = loaded_model.predict(input_data)
# Take the logits from the "logits" entry when the output has one; otherwise
# fall back to the first element of the output.
if "logits" in output:
    logits = output["logits"]
else:
    logits = output[0]

# Index of the highest-scoring class for the single input message.
prediction = tf.argmax(logits, axis=1).numpy()[0]
# The class index is interpreted by truthiness: a non-zero index prints as spam.
verdict = 'Spam' if prediction else 'Not Spam'

print(f"Message: {message}")
print(f"Loaded Model Prediction: {verdict}")