# Task 1

In [4]:
import pandas as pd

# Load only the label (v1) and message (v2) columns and give them readable
# names. Building the frame with `usecols` + `rename` yields a fresh,
# independent DataFrame, so the column assignments made in later cells do not
# operate on a slice of a wider frame (the usual source of
# SettingWithCopyWarning).
data = (
    pd.read_csv("spam.csv", encoding="ISO-8859-1", usecols=["v1", "v2"])
    [["v1", "v2"]]  # pin the column order regardless of the file layout
    .rename(columns={"v1": "label", "v2": "text"})
)

data.head()


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Handling missing values

In [5]:
# Per-column count of missing entries (the output below shows none).
data.isna().sum()

label    0
text     0
dtype: int64

### Encoding 

In [6]:

# Encode the target as integers (ham -> 0, spam -> 1).
# The guard keeps this cell safe to re-run: pushing already-encoded 0/1 values
# through the mapping a second time would turn every label into NaN.
label_codes = {'ham': 0, 'spam': 1}
if not pd.api.types.is_numeric_dtype(data['label']):
    data['label'] = data['label'].map(label_codes)

# Fail loudly if any label fell outside the mapping (map() yields NaN for those).
assert data['label'].isin(list(label_codes.values())).all(), "unexpected label value"

In [7]:
# Preview the frame again; `label` should now show the 0/1 codes.
data.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


### Text Cleaning

In [8]:
import nltk
# Print the installed NLTK version so it is recorded next to the results.
print(nltk.__version__)

3.9.2


In [9]:
import nltk, string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the NLTK resources used by the cleaning step.
# NLTK 3.8.2 and later (this notebook ran on 3.9.2) load the tokenizer models
# for word_tokenize from 'punkt_tab'; without it a fresh machine raises
# LookupError. The legacy 'punkt' package is still requested so older NLTK
# installs keep working.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

# Shared helpers for the text-cleaning function.
stop_words = set(stopwords.words("english"))  # set -> O(1) membership tests
ps = PorterStemmer()  # not used by the cells visible in this notebook
lemmatizer = WordNetLemmatizer()



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\salah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\salah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\salah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
def text_cleaning(text):
    """Normalise one message for vectorisation.

    Lower-cases the text, strips every character in ``string.punctuation``,
    tokenises with NLTK, drops English stop words and lemmatises what is left.

    Args:
        text: The raw message as a string.

    Returns:
        The surviving lemmatised tokens joined by single spaces.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    normalised = text.lower().translate(punctuation_table)

    kept = []
    for word in nltk.word_tokenize(normalised):
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return ' '.join(kept)


# Cleaned copy of every message, stored next to the original text.
data['clean_text'] = data['text'].apply(text_cleaning)


### Feature Extraction

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Model inputs: cleaned message text as features, encoded label as target.
X, y = data['clean_text'], data['label']

# Bag-of-words counts, capped at 5000 vocabulary terms.
# NOTE(review): the vocabulary is fitted on every message before the
# train/test split, so words from the held-out rows shape the feature space.
# Consider fitting the vectorizer on the training portion only.
cv = CountVectorizer(max_features=5000)
X_vec = cv.fit_transform(X)


### Train Test Split

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_vec,
    y,
    test_size=0.2,
    random_state=42,
)

# Task 2

## Training Different Models

### Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression (max_iter set to 1000) on the training matrix, then
# predict labels for the held-out rows. fit() returns the estimator itself.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)


### Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier

# 200-tree random forest with a fixed seed; n_jobs=-1 uses all CPU cores.
rf_params = dict(n_estimators=200, random_state=42, n_jobs=-1)
rf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred_rf = rf.predict(X_test)


### XGBoost

In [24]:
from xgboost import XGBClassifier

# Gradient-boosted trees: 200 boosting rounds, learning rate 0.1, fixed seed.
xgb_params = dict(n_estimators=200, learning_rate=0.1, random_state=42)
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred_xgb = xgb.predict(X_test)


### Naive Bayes

In [26]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the word-count features, with default settings.
nb = MultinomialNB().fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)


### Evaluation

In [33]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def _classification_scores(y_true, y_pred):
    """Return (accuracy, precision, recall, F1) for one set of predictions."""
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
    )


# Score every model against the same held-out labels.
acc_nb, pre_nb, rec_nb, f1_nb = _classification_scores(y_test, y_pred_nb)      # Naive Bayes
acc_lr, pre_lr, rec_lr, f1_lr = _classification_scores(y_test, y_pred_lr)      # Logistic Regression
acc_rf, pre_rf, rec_rf, f1_rf = _classification_scores(y_test, y_pred_rf)      # Random Forest
acc_xgb, pre_xgb, rec_xgb, f1_xgb = _classification_scores(y_test, y_pred_xgb)  # XGBoost

In [34]:
# One row per model, one column per metric, in the order the models were scored.
model_scores = [
    ("Naive Bayes", acc_nb, pre_nb, rec_nb, f1_nb),
    ("Logistic Regression", acc_lr, pre_lr, rec_lr, f1_lr),
    ("Random Forest", acc_rf, pre_rf, rec_rf, f1_rf),
    ("XGBoost", acc_xgb, pre_xgb, rec_xgb, f1_xgb),
]
results_df = pd.DataFrame(
    model_scores,
    columns=["Model", "Accuracy", "Precision", "Recall", "F1-score"],
)

results_df


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score
0,Naive Bayes,0.977578,0.931034,0.9,0.915254
1,Logistic Regression,0.979372,1.0,0.846667,0.916968
2,Random Forest,0.979372,1.0,0.846667,0.916968
3,XGBoost,0.973094,0.983871,0.813333,0.890511


In [35]:
# Logistic regression is the model carried forward.
final_model = lr

import pickle

# Persist the classifier and the fitted vectorizer; both are needed to score
# new messages later.
for path, artifact in (("model.pkl", final_model), ("vectorizer.pkl", cv)):
    with open(path, "wb") as f:
        pickle.dump(artifact, f)


# Task 3

### Why this model was chosen
I trained multiple models: Naive Bayes, Logistic Regression, Random Forest, and XGBoost. Among them, Logistic Regression and Random Forest performed the best, especially in terms of F1-score, which balances precision and recall. Since spam detection needs to avoid both false alarms and missed spam, these models were the most reliable. I chose Logistic Regression for Task 4.

### How features impact prediction
Before training, all messages were cleaned and converted into numbers using CountVectorizer. CountVectorizer turns each message into a vector of word frequencies, so the machine learning model can work with text in numerical form. This helps the model learn from important words in the messages. For example, words like “win” and “claim” are likely to appear in spam, while ordinary conversational words usually indicate genuine messages. The model uses these patterns to decide whether a message is spam or not.

### What improvements can be done
This system can be improved by using TF-IDF instead of simple word counts to give more importance to meaningful words. The model performance can also be increased by tuning hyperparameters using techniques like cross-validation and grid search. In the future, more advanced approaches such as deep learning models (LSTM or Transformers) can be used to better understand the context of messages and further improve accuracy.