In [None]:


import numpy as np
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import re  
import string 
import nltk
from nltk.corpus import stopwords 
from nltk.stem import LancasterStemmer 
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score 
import warnings 
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and data-conversion warnings raised
# by the libraries imported above, so API misuse further down goes unnoticed.
warnings.filterwarnings("ignore")

In [None]:
# Load only the two columns used below (v1 and v2); the file is decoded as latin-1.
df = pd.read_csv(
    "/kaggle/input/sms-spam-collection-dataset/spam.csv",
    usecols=["v1", "v2"],
    encoding="latin-1",
)

In [None]:
# Give the two loaded columns descriptive names (assigned by position).
df.columns = ["Type", "Message"]

In [None]:
# Summary statistics for the loaded columns.
# NOTE(review): both columns are presumably text, in which case this reports
# count / unique / top / freq rather than numeric statistics -- confirm.
df.describe()

In [None]:
# Column dtypes, non-null counts and memory usage of the frame.
df.info()

In [None]:
# NOTE(review): `stemmer` is not referenced by any of the visible cells.
stemmer = LancasterStemmer()

# English stop words as a set, for fast membership tests while cleaning text.
stop_words = set(stopwords.words('english'))

In [None]:
def cleaning_data(text):
    """Normalize a raw message into a space-separated string of content words.

    Steps, in order: lowercase; remove @mentions, ``http...`` URLs and
    ``pic.<...>`` media links; replace every non-letter character with a
    space; tokenize with ``nltk.word_tokenize``; drop tokens that are in the
    module-level ``stop_words`` set or are two characters or shorter.

    Parameters
    ----------
    text : str
        Raw message. Any non-string value (e.g. ``None`` or NaN) is treated
        as an empty message.

    Returns
    -------
    str
        The kept tokens joined by single spaces; may be the empty string.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r'@\S+', '', text)  # remove mentions
    text = re.sub(r'http\S+', '', text)  # remove URLs
    # Escaped dot + word boundary: only "pic.<...>" links are removed. The
    # previous pattern '.pic\S+' matched any character before "pic" and so
    # deleted ordinary words such as " picture" or most of "topics".
    text = re.sub(r'\bpic\.\S+', '', text)
    # Every non-letter becomes a space, so "free+entry" yields two words
    # instead of fusing into "freeentry". This also strips all punctuation,
    # which makes a separate punctuation pass unnecessary.
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    tokens = nltk.word_tokenize(text)
    kept = [tok for tok in tokens if tok not in stop_words and len(tok) > 2]
    # Tokens contain no whitespace, so a plain join is already normalized.
    return " ".join(kept)

In [None]:
# Store the cleaned version of every message alongside the raw text.
df["CleanMessage"] = df["Message"].map(cleaning_data)

In [None]:
# Build TF-IDF features from the cleaned messages.
# NOTE(review): the vectorizer is fitted on every row, before any train/test
# split, so rows that later become test data influence its vocabulary and IDF weights.
clean_corpus = df["CleanMessage"]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(clean_corpus)

In [None]:
# Encode the label column as one dense indicator column of shape (n_rows, 1).
# `drop='first'` discards the column of the first category, leaving a single
# column for the remaining one (presumably ham -> 0.0, spam -> 1.0).
#
# The dense array is produced with `.toarray()` instead of the `sparse=False`
# keyword: that keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, where passing it raises a TypeError. Converting the default
# sparse result works on every version.
encoder = OneHotEncoder(drop='first')
Y = encoder.fit_transform(df[["Type"]]).toarray()

In [None]:
# Character count of each raw message.
df["Message_Length"] = df["Message"].map(len)

# Histogram (with KDE overlay) of message length, one colour per message type.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=df, x="Message_Length", hue="Type", bins=50, kde=True, ax=ax)
ax.set_title("Distribution of Message Length by Type")
plt.show()

In [None]:
from collections import Counter

def top_n_words(messages, n=20):
    """Return the ``n`` most frequent whitespace-separated words.

    Parameters
    ----------
    messages : iterable of str
        Texts whose words are counted together.
    n : int, default 20
        Maximum number of (word, count) pairs to return.

    Returns
    -------
    list of (str, int)
        Pairs ordered from most to least frequent.
    """
    counts = Counter()
    for message in messages:
        counts.update(message.split())
    return counts.most_common(n)

# Most frequent cleaned words for each message type.
spam_messages = df.loc[df["Type"] == "spam", "CleanMessage"]
ham_messages = df.loc[df["Type"] == "ham", "CleanMessage"]
top_spam_words = top_n_words(spam_messages)
top_ham_words = top_n_words(ham_messages)

spam_df = pd.DataFrame(top_spam_words, columns=['Word', 'Freq'])
ham_df = pd.DataFrame(top_ham_words, columns=['Word', 'Freq'])

# One horizontal bar chart per message type: spam first, then ham.
for word_freq, bar_color, chart_title in (
    (spam_df, 'red', "Top Words in Spam Messages"),
    (ham_df, 'green', "Top Words in Ham Messages"),
):
    plt.figure(figsize=(10, 4))
    sns.barplot(data=word_freq, x='Freq', y='Word', color=bar_color)
    plt.title(chart_title)
    plt.show()


In [None]:
# Number of messages per type. Titled and shown explicitly like the other
# plotting cells, so the figure stands alone and no Axes repr is printed.
ax = sns.countplot(data=df, x="Type")
ax.set_title("Number of Messages by Type")
plt.show()

In [None]:
# Split the cleaned text rather than a feature matrix fitted on all rows, so
# the TF-IDF vocabulary and IDF weights are learned from training rows only.
# Fitting the vectorizer before splitting lets test-set statistics leak into
# the training features and inflates the reported test score.
# The row assignment depends only on the number of rows and `random_state`,
# so the train/test membership is the same as when splitting a matrix.
msg_train, msg_test, Y_train, Y_test = train_test_split(
    df["CleanMessage"], Y, test_size=0.2, random_state=42
)

# `vectorizer` is (re)bound to the training-only fit so that any later
# `vectorizer.transform(...)` produces features matching the trained model.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(msg_train)
X_test = vectorizer.transform(msg_test)

In [None]:
# Train a multinomial Naive Bayes classifier on the training features.
# The target is flattened to 1-D: a column-vector target makes scikit-learn
# emit a DataConversionWarning. `np.ravel` is a no-op on 1-D input.
model = MultinomialNB()
model.fit(X_train, np.ravel(Y_train))

In [None]:
# Score on the same rows the model was fitted on, so this is an optimistic
# figure; the held-out set is the one to judge generalization by.
model.score(X_train,Y_train)

In [None]:
# Accuracy on the held-out test rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)
# Bare last expression so the notebook displays the value; previously it was
# computed but never shown.
accuracy

In [None]:
# Classify a few hand-written messages.
emails = [
    'Hey Mohamed, can we get together to watch football game tomorrow?',
    "URGENT: Your account will be locked. Verify your information immediately to avoid suspension!",
    "Congratulations! You've been selected to win a $1000 gift card. Click the link to claim your reward now!"
]
# New text must go through the same cleaning as the training messages; the
# vectorizer was fitted on cleaned text, so raw text would yield features that
# do not match what the model was trained on.
emails_clean = [cleaning_data(email) for email in emails]
emails_count = vectorizer.transform(emails_clean)
model.predict(emails_count)