In [None]:
# Mount Google Drive into the Colab runtime; the dataset paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import tarfile

def extract_tar_file(tar_path='/content/drive/MyDrive/빅데이터응용보안/trec07p.tar', extract_path='/content/drive/MyDrive/빅데이터응용보안/trec07'):
    """
    Extracts a tar file to a specified directory.

    Extraction is restricted to ``extract_path``: when the running Python
    provides tarfile extraction filters, the ``'data'`` filter is used;
    otherwise every member name is checked and the archive is refused if a
    member would resolve outside ``extract_path``.

    Args:
    tar_path (str): The path to the tar file. Defaults to the TREC07 archive on the mounted Drive.
    extract_path (str): The directory to extract the files into. Defaults to the ``trec07`` folder on the mounted Drive.

    Raises:
    tarfile.TarError: If a member would be written outside ``extract_path``.
    """
    import os  # local import: the enclosing cell only imports tarfile

    with tarfile.open(tar_path, 'r') as tar:
        if hasattr(tarfile, 'data_filter'):
            # Extraction filters exist: let the stdlib reject/sanitise unsafe members.
            tar.extractall(path=extract_path, filter='data')
        else:
            # Older Python: reject member names that escape the destination
            # (absolute paths or '..' components) before extracting anything.
            root = os.path.realpath(extract_path)
            for member in tar.getmembers():
                target = os.path.realpath(os.path.join(root, member.name))
                if os.path.commonpath([root, target]) != root:
                    raise tarfile.TarError(
                        f"Refusing to extract {member.name!r} outside {extract_path}")
            tar.extractall(path=extract_path)
        print(f"Extracted {tar_path} to {extract_path}")

# Unpack the TREC07 archive from Drive; extract_path is left at the function's default.
extract_tar_file('/content/drive/MyDrive/빅데이터응용보안/trec07p.tar')


Extracted /content/drive/MyDrive/빅데이터응용보안/trec07p.tar to /content/drive/MyDrive/빅데이터응용보안/trec07


In [None]:
# Directory holding the raw e-mail files (read as DATA_DIR + 'inmail.<n>' by read_email_files).
DATA_DIR = '/content/drive/MyDrive/빅데이터응용보안/trec07/trec07p/data/'
# Index file with one "<label> <path>" entry per line (parsed into the labels dict).
LABELS_FILE = '/content/drive/MyDrive/빅데이터응용보안/trec07/trec07p/full/index'
# Fraction of samples used for training; the rest becomes the test split.
# NOTE(review): the training cell assigns this constant again with the same value.
TRAINING_SET_RATIO = 0.7

In [None]:
# Maps an e-mail file name (last '/'-separated component of the indexed path)
# to its class: 1 for ham, 0 for anything else (i.e. spam).
labels = {}
# Read the labels: every non-empty line holds two whitespace-separated
# fields, the label first and the file path second.
with open(LABELS_FILE) as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing newline at EOF) instead of
            # crashing on the two-value unpack below.
            continue
        label, key = line.split()
        labels[key.split('/')[-1]] = 1 if label.lower() == 'ham' else 0

In [None]:
def read_email_files():
    """Load every indexed e-mail and its label.

    Files are visited in numeric order as ``inmail.1`` .. ``inmail.N`` where
    ``N`` is the number of entries in the module-level ``labels`` dict. Each
    file is read from ``DATA_DIR`` via ``email_read_util.extract_email_text``.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the list of extracted texts and ``y``
        the list of matching label values taken from ``labels``.
    """
    X, y = [], []
    for number in range(1, len(labels) + 1):
        name = 'inmail.' + str(number)
        path = os.path.join(DATA_DIR, name)
        X.append(email_read_util.extract_email_text(path))
        y.append(labels[name])
    return X, y

In [None]:
# Install NLTK and download its stopword corpus.
!pip install nltk
import nltk
nltk.download('stopwords')
# Required so that email_read_util can be imported
# (presumably that module relies on the NLTK stopwords — confirm against its source).



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Change the working directory before importing email_read_util.
# NOTE(review): presumably email_read_util.py lives in this folder, which is why the %cd is needed — confirm.
%cd /content/drive/MyDrive/빅데이터응용보안/trec07/
import email_read_util
import os

/content/drive/MyDrive/빅데이터응용보안/trec07


In [None]:
# Load all e-mail texts (X) and their labels (y: 1 = ham, 0 = spam).
X, y = read_email_files()

# CNN 모델 build 과정

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dropout, Dense

def build_model(vocab_size, embedding_dim, num_filters, kernel_size, pool_size, dropout_rate, dense_units, num_conv_layers, input_length=1000):
    """Build and compile a character-level 1D-CNN binary classifier.

    Architecture: Embedding -> ``num_conv_layers`` x (Conv1D + MaxPooling1D)
    -> Flatten -> Dropout -> Dense(relu) -> Dense(1, sigmoid).

    Args:
        vocab_size: Number of distinct input indices for the embedding layer.
        embedding_dim: Size of each embedding vector.
        num_filters: Number of filters in every Conv1D layer.
        kernel_size: Kernel width of every Conv1D layer.
        pool_size: Pool size of every MaxPooling1D layer.
        dropout_rate: Dropout rate applied after flattening.
        dense_units: Units in the hidden Dense layer.
        num_conv_layers: How many Conv1D + MaxPooling1D pairs to stack.
        input_length: Length of each encoded input sequence. Defaults to
            1000 (the previously hard-coded value); it must match the length
            the inputs are padded/truncated to.

    Returns:
        The compiled ``Sequential`` model (adam optimizer, binary
        cross-entropy loss, accuracy metric).
    """
    model = Sequential()
    # Embedding layer that maps each character index to a dense vector
    model.add(Embedding(vocab_size, embedding_dim, input_length=input_length))

    # Stack the convolutional and pooling layers
    for _ in range(num_conv_layers):
        model.add(Conv1D(num_filters, kernel_size, activation='relu'))
        model.add(MaxPooling1D(pool_size))

    model.add(Flatten())
    model.add(Dropout(dropout_rate))
    model.add(Dense(dense_units, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Output layer: single probability

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Model hyperparameters
VOCAB_SIZE = 256  # one embedding row per character code 0-255 (char_index is built from range(256))
EMBEDDING_DIM = 50
NUM_FILTERS = 64
KERNEL_SIZE = 3
POOL_SIZE = 2
DROPOUT_RATE = 0.2
DENSE_UNITS = 256
NUM_CONV_LAYERS = 5

# Build the model and print its layer summary
cnn_model = build_model(VOCAB_SIZE, EMBEDDING_DIM, NUM_FILTERS, KERNEL_SIZE, POOL_SIZE, DROPOUT_RATE, DENSE_UNITS, NUM_CONV_LAYERS)
cnn_model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1000, 50)          12800     
                                                                 
 conv1d (Conv1D)             (None, 998, 64)           9664      
                                                                 
 max_pooling1d (MaxPooling1  (None, 499, 64)           0         
 D)                                                              
                                                                 
 conv1d_1 (Conv1D)           (None, 497, 64)           12352     
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 248, 64)           0         
 g1D)                                                            
                                                                 
 conv1d_2 (Conv1D)           (None, 246, 64)           1

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

def preprocess_email(text):
    """Clean one e-mail text with TensorFlow string ops.

    Steps, in order: replace HTML tags with a space, drop every character
    that is not an ASCII letter, digit or space, then lowercase the result.

    Args:
        text: The raw e-mail text.

    Returns:
        The value returned by ``tf.strings.lower`` (callers convert it back
        with ``.numpy().decode('utf-8')``).
    """
    without_tags = tf.strings.regex_replace(text, "<[^>]+>", " ")  # strip HTML tags
    alnum_only = tf.strings.regex_replace(without_tags, "[^a-zA-Z0-9 ]", "")  # strip special characters
    return tf.strings.lower(alnum_only)  # lowercase

# X: raw e-mail text data, y: list of 0/1 labels
# Each e-mail is cleaned and converted back from a tensor to a Python str.
X_clean = [preprocess_email(email).numpy().decode('utf-8') for email in X]

# Character embedding: encode text through a character -> index mapping
char_index = {chr(i): i for i in range(256)}  # maps each character with code point 0-255 to its code point

def encode_texts(texts, max_length=1000):
    """Encode texts as fixed-length integer sequences.

    Each character is looked up in the module-level ``char_index``;
    characters missing from it are encoded as 0. Sequences are then
    padded / truncated at the end to ``max_length`` by ``pad_sequences``.

    Args:
        texts: Iterable of strings to encode.
        max_length: Length of every output sequence. Defaults to 1000.

    Returns:
        The result of ``pad_sequences`` for the encoded texts.
    """
    # truncating='post' keeps only the first max_length items, so slice the
    # text before encoding instead of encoding characters that get discarded.
    encoded = [[char_index.get(char, 0) for char in text[:max_length]] for text in texts]
    return pad_sequences(encoded, maxlen=max_length, padding='post', truncating='post')

# Encode every cleaned e-mail into a fixed-length integer sequence.
X_encoded = encode_texts(X_clean)

# Encode X and y in one pass first, then split
# (the index range is split alongside so samples can be traced back to their original position)
TRAINING_SET_RATIO = 0.7
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X_encoded, y, range(len(y)),
    train_size=TRAINING_SET_RATIO, random_state=2
)

# The labels have to be converted to arrays for training to work.
y_train2 = np.array(y_train)
y_test2 = np.array(y_test)

# Train the model
# NOTE(review): the test split is also passed as validation data, so the
# per-epoch validation metrics are measured on the same data as the final test metrics.
cnn_model.fit(X_train, y_train2, epochs=50, validation_data=(X_test, y_test2))
# The original setting was 100 epochs; lowered to 50 because performance did not differ


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7f236c2d6fe0>

In [None]:
# Number of samples in the training split
len(X_train)

52793

In [None]:
# Number of samples in the test split
len(X_test)

22626

In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.metrics import classification_report

# The CNN's predictions are probabilities, because its last layer is a sigmoid.
# Probabilities above 0.5 are therefore mapped to class 1, everything else (<= 0.5) to class 0.
y_pred_prob = cnn_model.predict(X_test)
y_pred_class = (y_pred_prob > 0.5).astype(int)

# NOTE(review): no pos_label is passed, so precision/recall use scikit-learn's default
# positive class 1, which is ham in this notebook's label encoding (ham = 1, spam = 0).
precision = precision_score(y_test, y_pred_class)
recall = recall_score(y_test, y_pred_class)
accuracy = accuracy_score(y_test, y_pred_class)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"Accuracy: {accuracy:.2f}")


Precision: 0.99
Recall: 0.98
Accuracy: 0.99


In [None]:
# Per-class precision/recall/F1; target_names follow the label order 0 = Spam, 1 = Ham.
print(classification_report(y_test,y_pred_class,target_names=['Spam','Ham']))
print('Classification accuracy {:.1%}'.format(accuracy_score(y_test,y_pred_class)))

              precision    recall  f1-score   support

        Spam       0.99      1.00      0.99     15035
         Ham       0.99      0.98      0.98      7591

    accuracy                           0.99     22626
   macro avg       0.99      0.99      0.99     22626
weighted avg       0.99      0.99      0.99     22626

Classification accuracy 99.0%
