In [None]:
import kagglehub
import pandas as pd
import numpy as np
# Fetch the dataset through kagglehub; the call returns the local directory
# that holds the downloaded files.
dataset_handle = "mariumfaheem666/spam-sms-classification-using-nlp"
path = kagglehub.dataset_download(dataset_handle)

# Report where the dataset files were placed.
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'spam-sms-classification-using-nlp' dataset.
Path to dataset files: /kaggle/input/spam-sms-classification-using-nlp


## Load the dataset

### Subtask:
Load the dataset from the specified path.

In [None]:
import kagglehub
import pandas as pd
import numpy as np
import os

# Fetch the dataset through kagglehub and report the local directory it returns.
dataset_handle = "mariumfaheem666/spam-sms-classification-using-nlp"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

# Show the directory contents so the CSV filename can be confirmed.
downloaded_files = os.listdir(path)
print(downloaded_files)

# Read the CSV and rename 'Class' -> 'label' and 'Message' -> 'sentence',
# the column names the later cells refer to.
file_path = os.path.join(path, "Spam_SMS.csv")
column_names = {'Class': 'label', 'Message': 'sentence'}
df = pd.read_csv(file_path).rename(columns=column_names)

display(df.head())

Using Colab cache for faster access to the 'spam-sms-classification-using-nlp' dataset.
Path to dataset files: /kaggle/input/spam-sms-classification-using-nlp
['Spam_SMS.csv']


Unnamed: 0,label,sentence
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Preprocessing and Splitting

### Subtask:
Preprocess the text data and split the data into training and testing sets.

## Model Training

### Subtask:
Define and train a BERT model for SMS spam classification.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer
import tensorflow as tf

# Shared preprocessing settings, defined once so the train and test pipelines
# cannot drift apart.
SEED = 42
MAX_LENGTH = 128  # Reduced max_length for efficiency
BATCH_SIZE = 32

# Encode labels to integers.
# NOTE: this overwrites df['label'] in place, so re-running this cell without
# re-running the loading cell re-fits the encoder on the integer ids and
# le.classes_ then no longer holds the original class names.
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

# Split into train and test. Stratifying on the label keeps the class
# proportions the same in both splits, so the test metrics are not skewed by
# an unlucky draw of the rarer class.
X_train, X_test, y_train, y_test = train_test_split(
    df['sentence'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=SEED,
    stratify=df['label'],
)

print("Train set size:", len(X_train))
print("Test set size:", len(X_test))

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def encode_texts(texts):
    """Tokenize a list of strings into TensorFlow tensors.

    Sequences longer than MAX_LENGTH tokens are truncated; shorter ones are
    padded (padding=True pads up to the longest sequence in this call).
    """
    return tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors='tf',
    )


train_encodings = encode_texts(X_train)
test_encodings = encode_texts(X_test)

# Training batches are reshuffled on every pass over the dataset so the model
# does not see the same batches in the same order each epoch. The buffer
# covers the whole training set, which makes it a full shuffle.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train))
    .shuffle(len(X_train), seed=SEED)
    .batch(BATCH_SIZE)
)

# The test pipeline keeps its original order; evaluation does not need shuffling.
test_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test))
    .batch(BATCH_SIZE)
)

# Despite the label text, these print the shape of the first batch's input_ids.
print("Train dataset batch size:", next(iter(train_dataset))[0]['input_ids'].shape)
print("Test dataset batch size:", next(iter(test_dataset))[0]['input_ids'].shape)

Train set size: 4459
Test set size: 1115
Train dataset batch size: (32, 128)
Test dataset batch size: (32, 128)


In [29]:
from transformers import TFBertForSequenceClassification, create_optimizer
# from tensorflow.keras.callbacks import EarlyStopping # Removed EarlyStopping
import tensorflow as tf # Import tensorflow

# Training settings. NUM_EPOCHS feeds both the learning-rate schedule length
# and model.fit, so the schedule always spans exactly the epochs being trained.
SEED = 42
NUM_EPOCHS = 10
INIT_LR = 5e-5

# Seed TensorFlow's global RNG before the model is built so the newly
# initialised classification head starts from the same weights on every run.
# (This does not by itself guarantee bit-identical training on every device.)
tf.random.set_seed(SEED)

# Define the BERT model
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,  # Binary classification
    use_safetensors=False  # Explicitly set use_safetensors to False
)

# Define optimizer
# create_optimizer returns the optimizer together with its learning-rate
# schedule. len(train_dataset) counts batches, so this is the total number of
# optimizer steps over the whole run.
num_train_steps = len(train_dataset) * NUM_EPOCHS
optimizer, lr_schedule = create_optimizer(
    init_lr=INIT_LR,
    num_warmup_steps=0,  # You can adjust warmup steps
    num_train_steps=num_train_steps
)

# Compile the model
# from_logits=True: the loss applies the softmax itself, so it must be fed
# unnormalised scores rather than probabilities.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Train the model
# No EarlyStopping callback is used; all NUM_EPOCHS epochs run. Batch size is
# handled by the dataset, so it is not passed to fit().
# NOTE(review): validation_data is the test set, so the per-epoch validation
# metrics are not an independent estimate if they are used to pick settings.
history = model.fit(
    train_dataset,
    epochs=NUM_EPOCHS,
    validation_data=test_dataset,
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Task
Create a complete pipeline for training a BERT model for SMS spam classification, including data loading from "Spam_SMS.csv", preprocessing, model definition, training, and evaluation.

## Saving the Model and Tokenizer

### Subtask:
Save the trained BERT model and tokenizer to a local directory for offline use.

In [31]:
import os

# Define the directory to save the model and tokenizer
save_directory = "./spam_classification_model"

# Create the directory (and any missing parents). exist_ok=True makes this a
# no-op when the directory already exists, with no separate existence check
# that could race with another process. It still raises FileExistsError when
# the path exists but is not a directory, so a stray file is reported here
# instead of surfacing later during saving.
os.makedirs(save_directory, exist_ok=True)

# Save the model and the tokenizer into the same directory.
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to: {save_directory}")

Model and tokenizer saved to: ./spam_classification_model


## Data loading

### Subtask:
Load the dataset from the specified path into a pandas DataFrame.


## Filtering Dependencies

### Subtask:
Refine the requirements file to include only relevant deep learning and Streamlit dependencies.

In [33]:
def _requirement_name(line):
    """Return the normalised package name that starts a requirements line.

    The name is the leading run of letters, digits, '.', '_' and '-', so
    version specifiers, extras and markers ("==1.0", "[extra]", " @ url",
    "; python_version...") are not part of it. Runs of '-', '_' and '.' are
    collapsed to a single '-' and the result is lower-cased, so "Scikit_Learn"
    and "scikit-learn" compare equal. Returns "" for blank lines, comments,
    and option lines such as "-e ...".
    """
    import re  # local import so this cell does not depend on another cell's imports

    match = re.match(r"[A-Za-z0-9][A-Za-z0-9._-]*", line.strip())
    if match is None:
        return ""
    return re.sub(r"[-_.]+", "-", match.group(0)).lower()


def filter_requirements(requirements_file="requirements.txt"):
    """Rewrite a requirements file so it lists only the relevant packages.

    A line is kept only when its package name is exactly one of the relevant
    packages (compared case-insensitively, with '-', '_' and '.' treated as
    equivalent). Matching on the whole name, rather than on a prefix, stops
    look-alikes such as "tensorflow-estimator" or "numpy-financial" from being
    kept just because they start with a relevant name. Kept lines are written
    back unchanged and in their original order.

    Args:
        requirements_file: Path of the requirements file to filter in place.

    Returns:
        None. If the file does not exist, an error message is printed and
        nothing is written.
    """
    relevant_packages = [
        "streamlit",
        "transformers",
        "tensorflow",
        "numpy",
        "pandas",
        "scikit-learn",
        "kagglehub" # Include kagglehub if you need to download the dataset within the Streamlit app
        # Add other relevant packages if necessary
    ]
    wanted_names = {_requirement_name(pkg) for pkg in relevant_packages}

    try:
        with open(requirements_file, "r") as f:
            # Blank, comment and option lines yield "" and are therefore dropped.
            filtered_lines = [
                line for line in f if _requirement_name(line) in wanted_names
            ]
    except FileNotFoundError:
        print(f"Error: {requirements_file} not found.")
        return

    # Write the filtered dependencies back to the file
    with open(requirements_file, "w") as f:
        f.writelines(filtered_lines)

    print(f"Filtered dependencies saved to {requirements_file}")

# Run the filter on the default "requirements.txt"; the file is rewritten in place.
filter_requirements()

Filtered dependencies saved to requirements.txt
