In [1]:
import pandas as pd

# Load the dataset (decoded as latin-1)
df = pd.read_csv('spam.csv', encoding='latin-1')

# Display the first few rows of the dataframe
print(df.head())

# Check for missing values
print(df.isnull().sum())

# Keep only the label and text columns: the 'Unnamed: *' columns are almost
# entirely NaN (see the missing-value counts printed by this cell).
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

# Rename by name rather than by position so the mapping does not depend on
# the order of the remaining columns.
df = df.rename(columns={'v1': 'label', 'v2': 'message'})

# Display basic statistics and structure.
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()
print(df.describe())
print(df['label'].value_counts())


     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
No

In [2]:
# Convert the labels to numerical values (ham -> 0, spam -> 1).
label_ids = {'ham': 0, 'spam': 1}

# Values that are not keys of the mapping are passed through unchanged, so
# re-running this cell on an already-encoded column is a no-op instead of
# silently turning every label into NaN (which Series.map(dict) would do).
df['label'] = df['label'].map(lambda value: label_ids.get(value, value))

# Fail loudly on anything that is neither a known label nor an encoded id,
# rather than letting bad labels flow into model training.
unexpected_labels = set(df['label'].unique()) - set(label_ids.values())
if unexpected_labels:
    raise ValueError(f'Unexpected label values: {sorted(unexpected_labels, key=str)}')

print(df.head())


   label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...


In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Stemmer instance shared by the text-cleaning code below
ps = PorterStemmer()

# Fetch the NLTK stopword corpus used when filtering tokens
nltk.download('stopwords')

# Build the stopword set once. The original looked the list up with
# stopwords.words('english') for every single word, and tested membership
# against a list; a frozenset built a single time makes each test O(1).
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


# Function to clean the text data
def clean_text(text):
    """Normalize a raw message into a space-separated string of stemmed tokens.

    Steps: replace every non-ASCII-letter character with a space, lowercase,
    split on whitespace, drop English stopwords, then Porter-stem what is left.

    Args:
        text: Raw message text (str).

    Returns:
        The cleaned message as a single string; empty if no tokens survive.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)  # Remove all characters except letters
    tokens = letters_only.lower().split()
    # Stopwords are filtered on the lowercased token, before stemming
    stems = [ps.stem(token) for token in tokens if token not in ENGLISH_STOPWORDS]
    return ' '.join(stems)

# Replace every raw message with its cleaned, stemmed form
cleaned_messages = df['message'].apply(clean_text)
df['message'] = cleaned_messages
print(df.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/amrtutha/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


   label                                            message
0      0  go jurong point crazi avail bugi n great world...
1      0                              ok lar joke wif u oni
2      1  free entri wkli comp win fa cup final tkt st m...
3      0                u dun say earli hor u c alreadi say
4      0               nah think goe usf live around though


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the cleaned messages with TF-IDF, capping the vocabulary at 3000 terms
tfidf = TfidfVectorizer(max_features=3000)

# NOTE(review): the vectorizer is fit on the full dataset, i.e. before the
# train/test split performed in a later cell, so test-set vocabulary and IDF
# statistics leak into the features. Consider fitting on the training split only.
tfidf_matrix = tfidf.fit_transform(df['message'])

# Dense feature matrix and label vector consumed by the following cells
X = tfidf_matrix.toarray()
y = df['label'].values

print(X.shape, y.shape)


(5572, 3000) (5572,)


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(*(part.shape for part in (X_train, X_test, y_train, y_test)))


(4457, 3000) (1115, 3000) (4457,) (1115,)


In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fit a multinomial Naive Bayes model on the training split
classifier = MultinomialNB().fit(X_train, y_train)

# Score the held-out test split
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report the evaluation metrics, one section per line group
print(
    f'Accuracy: {accuracy}',
    f'Confusion Matrix:\n {conf_matrix}',
    f'Classification Report:\n {class_report}',
    sep='\n',
)


Accuracy: 0.9748878923766816
Confusion Matrix:
 [[965   0]
 [ 28 122]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.81      0.90       150

    accuracy                           0.97      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115



In [8]:
# Function to predict if a given message is spam or ham
def predict_message(message):
    """Classify a single raw message as 'spam' or 'ham'.

    The message is run through clean_text and the already-fitted TF-IDF
    vectorizer before being scored by the trained classifier.

    Args:
        message: Raw message text (str).

    Returns:
        'spam' if the classifier predicts label 1, otherwise 'ham'.
    """
    # Clean the message
    cleaned_message = clean_text(message)

    # transform() takes an iterable of documents, hence the one-element list
    features = tfidf.transform([cleaned_message]).toarray()

    # predict() returns one entry per input row; take that single scalar
    # explicitly instead of relying on the truth value of an array comparison
    predicted_label = classifier.predict(features)[0]

    # Map the numerical value to the original label
    return 'spam' if predicted_label == 1 else 'ham'

# Try the predictor on a typical promotional message
test_message = "Congratulations! You've won a $1,000 Walmart gift card. Go to http://bit.ly/123456 to claim now."
predicted_label = predict_message(test_message)
print(predicted_label)


spam
