In [15]:
import os

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# --- Load data ---
# Location of the SMS spam CSV. The original machine-specific path is kept as
# the default so existing runs are unaffected; set the SPAM_CSV_PATH
# environment variable to point the notebook at the file on another machine.
DATA_PATH = os.environ.get(
    "SPAM_CSV_PATH",
    "C:\\Users\\HP\\Downloads\\archive (3)\\spam.csv",
)

# latin1 is requested explicitly (presumably the file is not valid UTF-8 -- confirm).
raw_data = pd.read_csv(DATA_PATH, encoding='latin1')

# Assuming 'v1' contains labels and 'v2' contains SMS messages.
# rename() returns a new frame, so the freshly loaded frame is left untouched.
data = raw_data.rename(columns={'v1': 'label', 'v2': 'text'})

# --- Encode labels: 0 for 'ham', 1 for 'spam' ---
LABEL_CODES = {'ham': 0, 'spam': 1}
encoded_labels = data['label'].map(LABEL_CODES)

# Series.map() yields NaN for any value missing from the mapping. Fail here with
# a clear message instead of letting the NaNs reach classifier.fit().
unmapped = encoded_labels.isna()
if unmapped.any():
    bad_values = sorted(data.loc[unmapped, 'label'].astype(str).unique())
    raise ValueError(
        f"Unexpected label values in {DATA_PATH!r}: {bad_values}; "
        f"expected only {sorted(LABEL_CODES)}"
    )
data['label'] = encoded_labels

# --- Split the data into training and testing sets ---
X = data['text']
y = data['label']
# NOTE(review): the recorded report shows an imbalanced test set (965 ham vs
# 150 spam). Passing stratify=y would keep the class ratio identical in both
# splits; it is not applied here so the recorded metrics stay reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- TF-IDF vectorization ---
# The vocabulary is fitted on the training split only and then reused to
# transform the test split, so no test-set information leaks into the features.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Adjust max_features as needed
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# --- Build and train a Logistic Regression model ---
classifier = LogisticRegression()
classifier.fit(X_train_tfidf, y_train)

# --- Make predictions on the held-out split ---
y_pred = classifier.predict(X_test_tfidf)

# --- Evaluate the model ---
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", report)


Accuracy: 0.97
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       0.98      0.81      0.89       150

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [17]:
# Function to predict whether a message is spam or ham
def predict_message(message, vectorizer=None, model=None):
    """Classify a single message as ``'spam'`` or ``'ham'``.

    Parameters
    ----------
    message : str
        Raw message text to classify.
    vectorizer : optional
        Fitted object exposing ``transform(list_of_documents)``. When omitted,
        the notebook-level ``tfidf_vectorizer`` is used.
    model : optional
        Fitted object exposing ``predict(features)``. When omitted, the
        notebook-level ``classifier`` is used.

    Returns
    -------
    str
        ``'spam'`` when the model predicts class 1, otherwise ``'ham'``.
    """
    # Fall back to the objects fitted earlier in the notebook so existing
    # one-argument calls keep working unchanged.
    if vectorizer is None:
        vectorizer = tfidf_vectorizer
    if model is None:
        model = classifier

    # The single message is wrapped in a list because transform() is called
    # with a collection of documents.
    message_tfidf = vectorizer.transform([message])

    # predict() returns one prediction per document; take the only one.
    prediction = model.predict(message_tfidf)[0]

    # Convert the numeric class back to a human-readable label.
    return 'spam' if prediction == 1 else 'ham'

# Example: classify a short promotional message.
input_message = "Free iPhone now! Click to claim."
predicted_label = predict_message(message=input_message)
print(f"The message is predicted as: {predicted_label}")


The message is predicted as: spam


In [19]:
# Example: classify a conversational message.
# NOTE(review): this text looks like it may come from the dataset itself; if it
# is in the training split, this is not an independent check -- confirm.
input_message = (
    "Go until jurong point, crazy.. "
    "Available only in bugis n great world la e buffet... "
    "Cine there got amore wat..."
)
predicted_label = predict_message(message=input_message)
print(f"The message is predicted as: {predicted_label}")


The message is predicted as: ham
