Các em download dữ liệu đính kèm. Dựa vào các file trong thư mục train (TrainData), điền nhãn cho các file trong thư mục test

(TestData_nolabel). Có 2 nhãn là spam và notspam. Ví dụ file nhãn sẽ là:

0_unknown.txt,spam
1_unknown.txt,notspam
...

Nộp file code và file nhãn.


# Library

In [1]:
import pandas as pd
import numpy as np

from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split


# 1. Read data

## a.Read file spam

In [2]:
df_label: list = []
df_content: list= []
for i in range(18):
    data = pd.read_csv(f"./BaiThi2/TrainData/spam/{i}_spam.txt")
    df_label.append("spam")
    df_content.append(data)
len(df_label)

18

## b.Read file notspam

In [3]:
data_label: list = []
data_content: list = []
for i in range(193):
    df = pd.read_csv(f"./BaiThi2/TrainData/notspam/{i}_notspam.txt")
    data_label.append("not_spam")
    data_content.append(df)
len(data_label)

193

## c.Read file unknown

In [4]:
data_unknown: list = []
for i in range(78):
    dt = pd.read_csv(f"./BaiThi2/TestData_nolabel/{i}_unknown.txt")
    data_unknown.append(dt)
len(data_unknown)


78

## convert to DataFrame

In [5]:
data_train = pd.DataFrame(
    {
        "Label" : pd.Series(df_label + data_label),
        "Content": pd.Series(df_content + data_content)
    }
)
display(data_train)

Unnamed: 0,Label,Content
0,spam,...
1,spam,...
2,spam,...
3,spam,...
4,spam,...
...,...,...
206,not_spam,...
207,not_spam,...
208,not_spam,...
209,not_spam,...


# 2. Preprocess

### chuẩn hóa

In [6]:
data_train['Label'] = data_train['Label'].apply(lambda i : 1 if i == "spam" else 0)

In [7]:
print(data_train)

     Label                                            Content
0        1                                                ...
1        1                                                ...
2        1                                                ...
3        1                                                ...
4        1                                                ...
..     ...                                                ...
206      0                                                ...
207      0                                                ...
208      0                                                ...
209      0                                                ...
210      0                                                ...

[211 rows x 2 columns]


### Function

In [8]:
# create dictionary
def Make_dict(data_train_content: pd.DataFrame):
    all_word: list = []
    for content in data_train_content:
        word = str(content).split()
        all_word += word
    dictionary = Counter(all_word)
    list_dict_remove  = dictionary.keys()
    for key in list(list_dict_remove):
        if key.isalpha() == False:
            del dictionary[key]
        elif len(key) == 1:
            del dictionary[key]
    dictionary = dictionary.most_common(3000)
    return dictionary


In [9]:
# create matrix, label
def extract_features(data_train: pd.DataFrame):
    feature_matrix = np.zeros((data_train.shape[0], 3000))
    train_label = np.array(list(data_train['Label']))
    docId = 0
    for content in data_train['Content']:
        content_used: str = ""
        list_line = list(str(content).split('\n'))
        for i in range(1, len(list_line)):
            content_used += list_line[i]
        words = content_used.split()
        for word in words:
            wordId = 0
            for i, dic in enumerate(dictionary):
                if dic[0] == word:
                    wordId = i
                    feature_matrix[docId, wordId] = words.count(word)
        docId += 1
    return feature_matrix, train_label

# 3. Model

## Split data

In [10]:
train, test = train_test_split(data_train, test_size=0.15)
pd.options.display.max_rows = 130


In [11]:
train['Label'].value_counts()

0    162
1     17
Name: Label, dtype: int64

In [12]:
test['Label'].value_counts()

0    31
1     1
Name: Label, dtype: int64

## Model

In [13]:
dictionary = Make_dict(data_train['Content'])
train_content, train_label = extract_features(train)
test_content, test_label = extract_features(test)

In [14]:
gau = GaussianNB()
gau.fit(train_content, train_label)

In [15]:
label_pre = gau.predict(test_content)
print("accuracy_score: ", accuracy_score(label_pre, test_label))

accuracy_score:  0.96875


In [16]:
print(label_pre)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


## predict

In [17]:
tmp_label = [0] * 78
data_nolabel = pd.DataFrame(
    {
        "Label" : pd.Series(tmp_label),
        "Content" : pd.Series(data_unknown)
    }
)

In [18]:
data_nolabel

Unnamed: 0,Label,Content
0,0,...
1,0,...
2,0,Subject: query : tagalog philippine info...
3,0,...
4,0,...
5,0,...
6,0,...
7,0,...
8,0,...
9,0,...


In [19]:
df_unknown, label_tmp = extract_features(data_nolabel)

In [20]:
# predict result
result = gau.predict(df_unknown)

In [21]:
result = pd.Series(result)
result = result.apply(lambda v : "spam" if v == 1 else "notspam")
result = list(result)
print(result)

['notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'spam', 'spam', 'spam', 'notspam', 'notspam', 'notspam', 'spam', 'notspam', 'notspam', 'notspam', 'spam', 'notspam', 'spam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'notspam', 'spam', 'notspam', 'notspam', 'spam', 'spam']


### file label

In [22]:
with open("file_label.txt", "w") as f:
    for i in range(78):
        s = f"{i}_unknown.txt, {result[i]}"
        f.write(s + "\n")