In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [41]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# **الف)**

In [42]:
data=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/trspam.csv', on_bad_lines='skip', verbose=False, header=0, names=['email', 'class'])
x = list(data['email'])
y = list(data['class'])

In [43]:
print("Number of data", data.shape[0])

missing_count = data.isnull().sum()
print("Number of missing values: ")
print(missing_count)

data_cleaned = data.dropna()
print("Number of data after delete missing values: ", data_cleaned.shape[0])

Number of data 701
Number of missing values: 
email    1
class    0
dtype: int64
Number of data after delete missing values:  700


In [44]:
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

# **ب)**

In [45]:
def preprocess_text(text):
    tokens = word_tokenize(text)

    tokens = [token.lower() for token in tokens]

    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]

    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    return " ".join(lemmatized_tokens)

In [None]:
train_data['email_processed'] = train_data['email'].apply(preprocess_text)
test_data['email_processed'] = test_data['email'].apply(preprocess_text) 

In [39]:
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_data['email_processed'])
X_test = vectorizer.transform(test_data['email_processed'])

y_train = train_data['class']
y_test = test_data['class']

# **پ)**

In [None]:
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
results = {}

In [25]:
print("Linear Kernel:\n")
for C in C_values:
    model = SVC(C=C, kernel='linear', random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    support_vectors = model.n_support_

    results[C] = {
        "accuracy": accuracy,
        "support_vectors": support_vectors
    }

    print(f"C={C}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Number of Support Vectors (per class): {support_vectors}")
    print(classification_report(y_test, y_pred))
    print("-" * 50)

best_C = max(results, key=lambda c: results[c]["accuracy"])
print(f"Best C Value: {best_C} with Accuracy: {results[best_C]['accuracy']:.4f}")
print(f"Support Vectors for Best C: {results[best_C]['support_vectors']}")


Linear Kernel:

C=0.001
Accuracy: 0.8815
Number of Support Vectors (per class): [175 126]
              precision    recall  f1-score   support

         ham       0.83      0.99      0.90       118
        spam       0.99      0.74      0.85        93

    accuracy                           0.88       211
   macro avg       0.91      0.87      0.88       211
weighted avg       0.90      0.88      0.88       211

--------------------------------------------------
C=0.01
Accuracy: 0.9289
Number of Support Vectors (per class): [160  87]
              precision    recall  f1-score   support

         ham       0.90      0.98      0.94       118
        spam       0.98      0.86      0.91        93

    accuracy                           0.93       211
   macro avg       0.94      0.92      0.93       211
weighted avg       0.93      0.93      0.93       211

--------------------------------------------------
C=0.1
Accuracy: 0.9431
Number of Support Vectors (per class): [144  81]
         

In [47]:
print("Polynomial Kernel:\n")

degrees = [1, 2, 3]
coef0_values = [-1, 0, 1]

for degree in degrees:
    for coef0 in coef0_values:
        print(f"Degree={degree}, Coef0={coef0}")
        model = SVC(C=1, kernel='poly', degree=degree, coef0=coef0, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        print(f"Accuracy: {accuracy:.4f}")
        print(classification_report(y_test, y_pred))
        print("-" * 50)


Polynomial Kernel:

Degree=1, Coef0=-1
Accuracy: 0.7488
              precision    recall  f1-score   support

         ham       0.69      0.99      0.82       118
        spam       0.98      0.44      0.61        93

    accuracy                           0.75       211
   macro avg       0.83      0.72      0.71       211
weighted avg       0.82      0.75      0.72       211

--------------------------------------------------
Degree=1, Coef0=0
Accuracy: 0.7488
              precision    recall  f1-score   support

         ham       0.69      0.99      0.82       118
        spam       0.98      0.44      0.61        93

    accuracy                           0.75       211
   macro avg       0.83      0.72      0.71       211
weighted avg       0.82      0.75      0.72       211

--------------------------------------------------
Degree=1, Coef0=1
Accuracy: 0.7488
              precision    recall  f1-score   support

         ham       0.69      0.99      0.82       118
        s

In [27]:
print("RBF (Radial Basis Function):\n")

gamma_values = [1 / X_train.shape[1], 1 / (X_train.shape[1] ** 2)]  # 1/K و 1/K^2

for gamma in gamma_values:
    print(f"Gamma={gamma}")
    model = SVC(C=1, kernel='rbf', gamma=gamma, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))
    print("-" * 50)

RBF (Radial Basis Function):

Gamma=2.9652473016249555e-05
Accuracy: 0.6493
              precision    recall  f1-score   support

         ham       0.62      0.98      0.76       118
        spam       0.91      0.23      0.36        93

    accuracy                           0.65       211
   macro avg       0.77      0.60      0.56       211
weighted avg       0.75      0.65      0.58       211

--------------------------------------------------
Gamma=8.79269155979408e-10
Accuracy: 0.5592
              precision    recall  f1-score   support

         ham       0.56      1.00      0.72       118
        spam       0.00      0.00      0.00        93

    accuracy                           0.56       211
   macro avg       0.28      0.50      0.36       211
weighted avg       0.31      0.56      0.40       211

--------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
