In [1]:
# Save this code as train_model.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import pickle

# Load your CSV data
df = pd.read_csv('spam.csv', encoding='latin-1')

# Check the first few rows to understand the structure
print(df.head())

# Rename columns for easier access
df = df.rename(columns={df.columns[0]: 'label', df.columns[1]: 'message'})

# Map labels to numeric values (ham = 0, spam = 1)
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Prepare features (X) and target (y)
X = df['message']
y = df['label']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use TF-IDF Vectorizer to convert text into numerical data
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train Logistic Regression model
log_reg_model = LogisticRegression(max_iter=1000, random_state=42)
log_reg_model.fit(X_train_tfidf, y_train)

# Save the model and vectorizer
with open('log_reg_model.pkl', 'wb') as f:  # Save Logistic Regression model
    pickle.dump(log_reg_model, f)

with open('tfidf_vectorizer.pkl', 'wb') as f:  # Save TF-IDF vectorizer
    pickle.dump(tfidf_vectorizer, f)

# Optionally: Evaluate the model
from sklearn.metrics import accuracy_score, classification_report
y_pred = log_reg_model.predict(X_test_tfidf)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
Accuracy:  0.9766816143497757
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       966
           1       1.00      0.83      0.90       149

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.95      1115
weighted avg       0.98      0.98      0.98      1115

