<a href="https://colab.research.google.com/github/OnyangoOmondie97/sms_text_classification/blob/main/fcc_sms_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import libraries
try:
    # Install the nightly TensorFlow build via an IPython shell escape (Colab/Jupyter only).
    !pip install tf-nightly
except Exception:
    pass

import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

print(tf.__version__)


In [None]:
# get data files
# Download the two freeCodeCamp SMS TSV files into the current working directory.
# These are IPython shell escapes, so they need a notebook kernel, `wget`, and network access.
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

# Local paths of the downloaded files. `train_file_path` is read by predict_message;
# `test_file_path` points at valid-data.tsv and is not referenced by the other visible cells.
train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:
def _is_spam_label(label):
    """Return True when `label` denotes the spam class.

    Accepts the string label ``'spam'`` (case-insensitive, surrounding
    whitespace ignored) as well as ``1`` for integer-encoded label columns.
    """
    return str(label).strip().lower() in ("spam", "1")


def _fit_spam_classifier(path):
    """Fit a TF-IDF vectorizer and a multinomial Naive Bayes model on the TSV at `path`.

    The file is read as two tab-separated, header-less columns: label, message.

    Returns:
        A ``(vectorizer, model, spam_index)`` tuple, where ``spam_index`` is the
        position of the spam class in ``model.classes_`` (and therefore the
        column of ``model.predict_proba`` holding the spam probability).

    Raises:
        ValueError: If none of the training labels denotes spam.
    """
    train_data = pd.read_csv(path, sep='\t', names=['label', 'message'])

    vectorizer = TfidfVectorizer()
    features = vectorizer.fit_transform(train_data['message'])

    model = MultinomialNB()
    model.fit(features, train_data['label'])

    # Look the spam column up instead of assuming it is column 1.
    spam_columns = [i for i, label in enumerate(model.classes_) if _is_spam_label(label)]
    if not spam_columns:
        raise ValueError(f"No spam class found among training labels: {list(model.classes_)}")
    return vectorizer, model, spam_columns[0]


def predict_message(pred_text):
    """Classify one SMS message as ham or spam.

    Args:
        pred_text: The message text to classify.

    Returns:
        A two-item list ``[likeliness_score, predicted_class]``: the model's
        estimated probability that the message is spam, followed by the
        string ``"spam"`` or ``"ham"``.

    Raises:
        ValueError: If the training labels contain no spam class.
        Any error raised while reading the training file is propagated.
    """
    # Fit once per training-file path and reuse the fitted objects; the cache lives
    # on the function object, so re-running this cell starts from a fresh cache.
    fitted = predict_message.__dict__.setdefault('_fitted', {})
    if train_file_path not in fitted:
        fitted[train_file_path] = _fit_spam_classifier(train_file_path)
    vectorizer, model, spam_index = fitted[train_file_path]

    # Vectorize the input with the vocabulary learned from the training data.
    message_vector = vectorizer.transform([pred_text])

    likeliness_score = model.predict_proba(message_vector)[0][spam_index]
    # Decide by label meaning rather than `== 1`: with string labels such as 'spam',
    # a comparison against the integer 1 is never true and would label everything "ham".
    predicted_class = "spam" if _is_spam_label(model.predict(message_vector)[0]) else "ham"

    return [likeliness_score, predicted_class]


In [None]:

def test_predictions():
    test_messages = ["how are you doing today",
                     "sale today! to stop texts call 98912460324",
                     "i dont want to go. can we try it a different day? available sat",
                     "our new mobile video service is live. just install on your phone to start watching.",
                     "you have won £1000 cash! call to claim your prize.",
                     "i'll bring it tomorrow. don't forget the milk.",
                     "wow, is your arm alright. that happened to me one time too"
                    ]

    test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
    passed = True

    for msg, ans in zip(test_messages, test_answers):
        prediction = predict_message(msg)
        if prediction[1] != ans:
            passed = False

    if passed:
        print("You passed the challenge. Great job!")
    else:
        print("You haven't passed yet. Keep trying.")

# Run the challenge check: classifies each sample message with predict_message
# and prints whether all of them matched the expected labels.
test_predictions()