In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pickle


In [8]:
# Load dataset
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')

In [11]:
print(df.columns)

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [9]:
# Display the first few rows
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [12]:
# Select and rename the relevant columns
df = df[['v1', 'v2']]  # Replace 'v1' and 'v2' with actual column names
df.columns = ['label', 'message']

In [13]:
print(df.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [14]:
# Text preprocessing: converting text to lowercase
df['message_cleaned'] = df['message'].apply(lambda x: x.lower())

# Check the cleaned data
df[['message', 'message_cleaned']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['message_cleaned'] = df['message'].apply(lambda x: x.lower())


Unnamed: 0,message,message_cleaned
0,"Go until jurong point, crazy.. Available only ...","go until jurong point, crazy.. available only ..."
1,Ok lar... Joking wif u oni...,ok lar... joking wif u oni...
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,U dun say so early hor... U c already then say...,u dun say so early hor... u c already then say...
4,"Nah I don't think he goes to usf, he lives aro...","nah i don't think he goes to usf, he lives aro..."


In [16]:
#Train-Test Split

from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['message_cleaned'], df['label'], test_size=0.2, random_state=42)

# Check the size of the train and test sets
print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

Training set size: 4457
Testing set size: 1115


In [17]:
# Vectorization


from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizing the text data using TF-IDF
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Check the shape of the transformed data
print(f"TF-IDF train shape: {X_train_vec.shape}")
print(f"TF-IDF test shape: {X_test_vec.shape}")

TF-IDF train shape: (4457, 7735)
TF-IDF test shape: (1115, 7735)


In [18]:
## MOdel Training

from sklearn.naive_bayes import MultinomialNB

# Initialize the MultinomialNB model
model = MultinomialNB()

# Train the model
model.fit(X_train_vec, y_train)

# Check the training completion
print("Model training completed.")

Model training completed.


In [19]:
# Model evaluation

from sklearn.metrics import accuracy_score

# Predicting on the test set
y_pred = model.predict(X_test_vec)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.96


In [20]:
## Model saving and Vectorization

import pickle

# Save the trained model to a pickle file
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Save the vectorizer to a pickle file
with open('vectorizer.pkl', 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)

print("Model and vectorizer saved to pickle files.")

Model and vectorizer saved to pickle files.
