<a href="https://colab.research.google.com/github/Tuevu110405/AIO_Module_6/blob/feature%2FCNN-week1/train_CNN_W1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import random
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as data

import torchvision.transforms as transforms
import torchvision.datasets as datasets

from torchsummary import summary

import matplotlib.pyplot as plt
from PIL import Image



# Fetch the MNIST train and test splits into ROOT (downloaded when missing).
ROOT = './data'
train_data, test_data = (
    datasets.MNIST(root=ROOT, train=is_train_split, download=True)
    for is_train_split in (True, False)
)

In [None]:


import time

In [None]:
# NOTE: despite its name, VALID_RATIO is the fraction of the original training
# set that is KEPT for training; the remainder becomes the validation split.
VALID_RATIO = 0.9

n_train_examples = int(len(train_data) * VALID_RATIO)
n_valid_examples = len(train_data) - n_train_examples

train_data, valid_data = data.random_split(
    train_data, [n_train_examples, n_valid_examples]
)

# Normalisation statistics are computed on the training subset only, so that
# nothing from the held-out validation images leaks into the preprocessing.
train_pixels = train_data.dataset.data[train_data.indices].float() / 255
mean = train_pixels.mean().item()
std = train_pixels.std().item()

train_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[mean], std=[std]),
])

test_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[mean], std=[std]),
])

# Both subsets wrap the SAME underlying dataset object, so the second
# assignment overwrites the first. That is harmless only because the two
# pipelines are identical; do not add train-only augmentation this way.
train_data.dataset.transform = train_transforms
valid_data.dataset.transform = test_transforms

BATCH_SIZE = 256

train_dataloader = data.DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_dataloader = data.DataLoader(valid_data, batch_size=BATCH_SIZE)

Xây dựng mô hình LeNet

In [None]:
class LeNetClassifier(nn.Module):
    """LeNet-style CNN for single-channel 28x28 images.

    Layout: conv(5x5, same padding) -> avg-pool -> ReLU -> conv(5x5) ->
    avg-pool -> ReLU -> flatten -> three fully connected layers.
    ``fc_1`` expects 16*5*5 features, which is what a 28x28 input produces
    (28 -> 28 -> 14 -> 10 -> 5).

    Args:
        num_classes: number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.conv1 = nn.Conv2d(
            in_channels=1, out_channels=6, kernel_size=5, padding='same'
        )
        self.avgpool1 = nn.AvgPool2d(kernel_size=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.avgpool2 = nn.AvgPool2d(kernel_size=2)
        self.flatten = nn.Flatten()
        self.fc_1 = nn.Linear(16 * 5 * 5, 120)
        self.fc_2 = nn.Linear(120, 84)
        self.fc_3 = nn.Linear(84, num_classes)

    def forward(self, inputs):
        """Return unnormalised class logits for a batch of images."""
        outputs = F.relu(self.avgpool1(self.conv1(inputs)))
        outputs = F.relu(self.avgpool2(self.conv2(outputs)))
        outputs = self.flatten(outputs)
        # Without a non-linearity between them, consecutive Linear layers
        # collapse into a single affine map, so the hidden layers need ReLU.
        outputs = F.relu(self.fc_1(outputs))
        outputs = F.relu(self.fc_2(outputs))
        # The last layer stays linear: CrossEntropyLoss expects raw logits.
        return self.fc_3(outputs)

In [None]:
def train(model, optimizer, criterion, train_dataloader, device, epoch=0, log_interval=50):
    """Run one training epoch and return ``(epoch_acc, epoch_loss)``.

    Args:
        model: module to optimise; switched to train mode.
        optimizer: optimizer stepping ``model``'s parameters.
        criterion: loss called as ``criterion(predictions, labels)``.
        train_dataloader: iterable of ``(inputs, labels)`` batches with ``len()``.
        device: device the batches are moved to.
        epoch: epoch number, used only in the progress log.
        log_interval: print the running accuracy every this many batches.

    Returns:
        ``(epoch_acc, epoch_loss)``: accuracy over every sample seen in the
        epoch and the mean of the per-batch losses. The order matches
        ``evaluate`` and the way the training loops unpack the result.
        Both values are 0.0 when the dataloader yields no batches.
    """
    model.train()
    window_correct, window_count = 0, 0  # since the last progress log
    epoch_correct, epoch_count = 0, 0    # whole epoch
    losses = []

    for idx, (inputs, labels) in enumerate(train_dataloader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        predictions = model(inputs)

        loss = criterion(predictions, labels)
        losses.append(loss.item())

        # backward
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()

        batch_correct = (predictions.argmax(1) == labels).sum().item()
        batch_count = labels.size(0)
        window_correct += batch_correct
        window_count += batch_count
        epoch_correct += batch_correct
        epoch_count += batch_count

        if idx % log_interval == 0 and idx > 0:
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(train_dataloader),
                    window_correct / window_count
                )
            )
            # Only the logging window is reset; the epoch totals keep growing,
            # so the returned accuracy never divides by zero after a log step.
            window_correct, window_count = 0, 0

    epoch_acc = epoch_correct / epoch_count if epoch_count else 0.0
    epoch_loss = sum(losses) / len(losses) if losses else 0.0
    return epoch_acc, epoch_loss

In [None]:
#Evaluation function
def evaluate(model, criterion, valid_dataloader, device=None):
    """Evaluate ``model`` on a dataloader without tracking gradients.

    Args:
        model: module to evaluate; switched to eval mode.
        criterion: loss called as ``criterion(predictions, labels)``.
        valid_dataloader: iterable of ``(inputs, labels)`` batches.
        device: device the batches are moved to. Defaults to the device of
            the model's first parameter (CPU for a parameter-free model), so
            the function no longer depends on a global ``device`` variable.

    Returns:
        ``(epoch_acc, epoch_loss)``: accuracy over all samples and the mean of
        the per-batch losses; both are 0.0 for an empty dataloader.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    total_acc, total_count = 0, 0
    losses = []
    with torch.no_grad():
        for inputs, labels in valid_dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            predictions = model(inputs)
            loss = criterion(predictions, labels)
            losses.append(loss.item())
            total_acc += (predictions.argmax(1) == labels).sum().item()
            total_count += labels.size(0)
    epoch_acc = total_acc / total_count if total_count else 0.0
    epoch_loss = sum(losses) / len(losses) if losses else 0.0
    return epoch_acc, epoch_loss

Training:

In [None]:
num_classes = len(train_data.dataset.classes)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

lenet_model = LeNetClassifier(num_classes).to(device)

optimizer = optim.Adam(lenet_model.parameters())
criterion = nn.CrossEntropyLoss()

num_epochs = 10
save_model = './models'
# torch.save does not create missing directories.
os.makedirs(save_model, exist_ok=True)
best_model_path = os.path.join(save_model, 'lenet_model.pt')

train_accs = []
train_losses = []
eval_accs = []
eval_losses = []
best_loss_eval = float('inf')

for epoch in range(1, num_epochs + 1):
    epoch_start_time = time.time()
    # Training
    # NOTE(review): this unpacking order must match the tuple order returned by train().
    train_acc, train_loss = train(lenet_model, optimizer, criterion,
                                  train_dataloader, device, epoch)
    train_accs.append(train_acc)
    train_losses.append(train_loss)

    # Evaluation
    eval_acc, eval_loss = evaluate(lenet_model, criterion, valid_dataloader)
    eval_accs.append(eval_acc)
    eval_losses.append(eval_loss)

    # Save best model: the threshold has to be updated, otherwise every epoch
    # overwrites the checkpoint and "best" just means "latest".
    if eval_loss < best_loss_eval:
        best_loss_eval = eval_loss
        torch.save(lenet_model.state_dict(), best_model_path)

    # Print loss, acc and epoch
    print("-" * 59)
    print("| End of epoch {:3d} | time: {:5.2f}s | Train Accuracy {:8.3f} | Train Loss {:8.3f} |"
          "Valid Accuracy {:8.3f} | Valid Loss {:8.3f} |".format(
              epoch, time.time() - epoch_start_time, train_acc, train_loss, eval_acc, eval_loss
          )
    )
    print("-" * 59)

# Load the best model once training has finished. Doing this inside the loop
# would reset the weights to the checkpoint after every epoch.
lenet_model.load_state_dict(torch.load(best_model_path, map_location=device))
lenet_model.eval()




In [None]:
# Score the reloaded LeNet on the MNIST test split, using the same
# preprocessing pipeline as validation.
test_data.transform = test_transforms
test_dataloader = data.DataLoader(dataset=test_data, batch_size=BATCH_SIZE)
test_acc, test_loss = evaluate(
    lenet_model,
    criterion,
    test_dataloader,
)
print(f"Test Accuracy: {test_acc}, Test Loss: {test_loss}")

Test Accuracy: 0.9646, Test Loss: 0.10873495448613539


Cassava Leaf Disease

In [None]:
# prompt: Download the cassava leaf disease dataset from googleapis
# NOTE(review): despite the prompt text, this cell goes through the Kaggle CLI.
# The credentials written to kaggle.json below are placeholders and must be
# replaced with real values before the download can succeed.

!pip install -U -q kaggle
!mkdir -p ~/.kaggle
!echo '{"username":"YOUR_KAGGLE_USERNAME","key":"YOUR_KAGGLE_API_KEY"}' > ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d datamunge/cassava-leaf-disease-classification
!unzip cassava-leaf-disease-classification.zip -d cassava-leaf-disease

In [None]:
# Download the cassava archive to /content and unpack it into the current directory.
# NOTE(review): --no-check-certificate disables TLS certificate verification;
# consider dropping the flag unless the download fails without it.
!wget --no-check-certificate https://storage.googleapis.com/emcassavadata/cassavaleafdata.zip -O /content/cassavaleafdata.zip
!unzip /content/cassavaleafdata.zip

import os
import random
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data as data

import torchvision.transforms as transforms
import torchvision.datasets as datasets

from torchsummary import summary

import matplotlib.pyplot as plt
from PIL import Image


In [None]:
# NOTE(review): these paths assume the archive unpacks train/valid/test
# directly into the working directory - confirm against the unzip output.
data_paths = {
    'train' : './train',
    'valid' : './valid/',
    'test' : './test'
}

# load image from path
def loader(path):
    """Open the image at ``path`` and return it as a 3-channel RGB PIL image.

    Converting to RGB guarantees the 3 input channels the model's first
    convolution expects, even for grayscale or RGBA files; reading through a
    context manager closes the file handle promptly.
    """
    with open(path, 'rb') as f:
        img = Image.open(f)
        return img.convert('RGB')

img_size = 150
train_transforms = transforms.Compose([
    transforms.Resize((img_size, img_size)),
    transforms.ToTensor(),
])

# The training split was never loaded, so later cells silently picked up the
# MNIST objects that were still bound to train_data / *_dataloader.
train_data = datasets.ImageFolder(root=data_paths['train'], loader=loader, transform=train_transforms)
valid_data = datasets.ImageFolder(root=data_paths['valid'], loader=loader, transform=train_transforms)
test_data = datasets.ImageFolder(root=data_paths['test'], loader=loader, transform=train_transforms)

BATCH_SIZE = 256

train_dataloader = data.DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_dataloader = data.DataLoader(valid_data, batch_size=BATCH_SIZE)
test_dataloader = data.DataLoader(test_data, batch_size=BATCH_SIZE)

In [None]:
class LeNetClassifier(nn.Module):
    """LeNet-style CNN for 3-channel 150x150 images.

    Layout: conv(5x5, same padding) -> avg-pool -> ReLU -> conv(5x5) ->
    avg-pool -> ReLU -> flatten -> three fully connected layers.
    ``fc_1`` expects 16*35*35 features, which is what a 150x150 input
    produces (150 -> 150 -> 75 -> 71 -> 35).

    Args:
        num_classes: number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.conv1 = nn.Conv2d(
            in_channels=3, out_channels=6, kernel_size=5, padding='same'
        )
        self.avgpool1 = nn.AvgPool2d(kernel_size=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.avgpool2 = nn.AvgPool2d(kernel_size=2)
        self.flatten = nn.Flatten()
        self.fc_1 = nn.Linear(16 * 35 * 35, 120)
        self.fc_2 = nn.Linear(120, 84)
        self.fc_3 = nn.Linear(84, num_classes)

    def forward(self, inputs):
        """Return unnormalised class logits for a batch of images."""
        outputs = F.relu(self.avgpool1(self.conv1(inputs)))
        outputs = F.relu(self.avgpool2(self.conv2(outputs)))
        outputs = self.flatten(outputs)
        # Without a non-linearity between them, consecutive Linear layers
        # collapse into a single affine map, so the hidden layers need ReLU.
        outputs = F.relu(self.fc_1(outputs))
        outputs = F.relu(self.fc_2(outputs))
        # The last layer stays linear: CrossEntropyLoss expects raw logits.
        return self.fc_3(outputs)

In [None]:
def train(model, optimizer, criterion, train_dataloader, device, epoch=0, log_interval=50):
    """Run one training epoch and return ``(epoch_acc, epoch_loss)``.

    Args:
        model: module to optimise; switched to train mode.
        optimizer: optimizer stepping ``model``'s parameters.
        criterion: loss called as ``criterion(predictions, labels)``.
        train_dataloader: iterable of ``(inputs, labels)`` batches with ``len()``.
        device: device the batches are moved to.
        epoch: epoch number, used only in the progress log.
        log_interval: print the running accuracy every this many batches.

    Returns:
        ``(epoch_acc, epoch_loss)``: accuracy over every sample seen in the
        epoch and the mean of the per-batch losses. The order matches
        ``evaluate`` and the way the training loops unpack the result.
        Both values are 0.0 when the dataloader yields no batches.
    """
    model.train()
    window_correct, window_count = 0, 0  # since the last progress log
    epoch_correct, epoch_count = 0, 0    # whole epoch
    losses = []

    for idx, (inputs, labels) in enumerate(train_dataloader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        predictions = model(inputs)

        loss = criterion(predictions, labels)
        losses.append(loss.item())

        # backward
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()

        batch_correct = (predictions.argmax(1) == labels).sum().item()
        batch_count = labels.size(0)
        window_correct += batch_correct
        window_count += batch_count
        epoch_correct += batch_correct
        epoch_count += batch_count

        if idx % log_interval == 0 and idx > 0:
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(train_dataloader),
                    window_correct / window_count
                )
            )
            # Only the logging window is reset; the epoch totals keep growing,
            # so the returned accuracy never divides by zero after a log step.
            window_correct, window_count = 0, 0

    epoch_acc = epoch_correct / epoch_count if epoch_count else 0.0
    epoch_loss = sum(losses) / len(losses) if losses else 0.0
    return epoch_acc, epoch_loss

In [None]:
#Evaluation function
def evaluate(model, criterion, valid_dataloader, device=None):
    """Evaluate ``model`` on a dataloader without tracking gradients.

    Args:
        model: module to evaluate; switched to eval mode.
        criterion: loss called as ``criterion(predictions, labels)``.
        valid_dataloader: iterable of ``(inputs, labels)`` batches.
        device: device the batches are moved to. Defaults to the device of
            the model's first parameter (CPU for a parameter-free model), so
            the function no longer depends on a global ``device`` variable.

    Returns:
        ``(epoch_acc, epoch_loss)``: accuracy over all samples and the mean of
        the per-batch losses; both are 0.0 for an empty dataloader.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    total_acc, total_count = 0, 0
    losses = []
    with torch.no_grad():
        for inputs, labels in valid_dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            predictions = model(inputs)
            loss = criterion(predictions, labels)
            losses.append(loss.item())
            total_acc += (predictions.argmax(1) == labels).sum().item()
            total_count += labels.size(0)
    epoch_acc = total_acc / total_count if total_count else 0.0
    epoch_loss = sum(losses) / len(losses) if losses else 0.0
    return epoch_acc, epoch_loss

In [None]:
# Works whether train_data is a Subset (classes live on .dataset) or a
# dataset that exposes .classes directly, such as an ImageFolder.
num_classes = len(getattr(train_data, 'dataset', train_data).classes)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

lenet_model = LeNetClassifier(num_classes).to(device)
learning_rate = 2e-4
optimizer = optim.Adam(lenet_model.parameters(), learning_rate)
criterion = nn.CrossEntropyLoss()

num_epochs = 10
save_model = './models'
# torch.save does not create missing directories.
os.makedirs(save_model, exist_ok=True)
best_model_path = os.path.join(save_model, 'lenet_model_casseva.pt')

train_accs = []
train_losses = []
eval_accs = []
eval_losses = []
best_loss_eval = float('inf')

for epoch in range(1, num_epochs + 1):
    epoch_start_time = time.time()
    # Training
    # NOTE(review): this unpacking order must match the tuple order returned by train().
    train_acc, train_loss = train(lenet_model, optimizer, criterion,
                                  train_dataloader, device, epoch)
    train_accs.append(train_acc)
    train_losses.append(train_loss)

    # Evaluation
    eval_acc, eval_loss = evaluate(lenet_model, criterion, valid_dataloader)
    eval_accs.append(eval_acc)
    eval_losses.append(eval_loss)

    # Save best model: the threshold has to be updated, otherwise every epoch
    # overwrites the checkpoint and "best" just means "latest".
    if eval_loss < best_loss_eval:
        best_loss_eval = eval_loss
        torch.save(lenet_model.state_dict(), best_model_path)

    # Print loss, acc and epoch
    print("-" * 59)
    print("| End of epoch {:3d} | time: {:5.2f}s | Train Accuracy {:8.3f} | Train Loss {:8.3f} |"
          "Valid Accuracy {:8.3f} | Valid Loss {:8.3f} |".format(
              epoch, time.time() - epoch_start_time, train_acc, train_loss, eval_acc, eval_loss
          )
    )
    print("-" * 59)

# Load the best model once training has finished. Doing this inside the loop
# would reset the weights to the checkpoint after every epoch.
lenet_model.load_state_dict(torch.load(best_model_path, map_location=device))
lenet_model.eval()




In [None]:
# Build the loader from the cassava test split right here, so the evaluation
# cannot silently reuse a `test_dataloader` left over from an earlier cell.
test_dataloader = data.DataLoader(test_data, batch_size=BATCH_SIZE)
test_acc, test_loss = evaluate(lenet_model, criterion, test_dataloader)
print(f"Test Accuracy: {test_acc}, Test Loss: {test_loss}")

In [None]:
# Clone the ntc-scv repository and unpack its zipped test and train archives into ./data.
!git clone https://github.com/congnghia0609/ntc-scv.git
!unzip ./ntc-scv/data/data_test.zip -d ./data
!unzip ./ntc-scv/data/data_train.zip -d ./data

import os
import pandas as pd

def load_data_from_path(folder_path):
    """Load a folder-per-label text corpus into a DataFrame.

    ``folder_path`` must contain one sub-directory per label, each holding
    one text file per example. Entries that are not directories (or, inside a
    label directory, not regular files) are skipped instead of crashing.

    Args:
        folder_path: root directory of the split.

    Returns:
        DataFrame with columns ``sentence`` (the file's lines joined with a
        space) and ``label`` (0 for ``neg``, 1 for ``pos``, the directory
        name unchanged for anything else). Rows follow sorted directory and
        file names so the result is reproducible.
    """
    label_ids = {"neg": 0, "pos": 1}
    examples = []
    for label_name in sorted(os.listdir(folder_path)):
        label_dir = os.path.join(folder_path, label_name)
        if not os.path.isdir(label_dir):
            continue
        label = label_ids.get(label_name, label_name)
        for file_name in sorted(os.listdir(label_dir)):
            file_path = os.path.join(label_dir, file_name)
            if not os.path.isfile(file_path):
                continue
            with open(file_path, "r", encoding="utf-8") as f:
                sentence = " ".join(f.readlines())
            examples.append({'sentence': sentence, 'label': label})
    return pd.DataFrame(examples, columns=['sentence', 'label'])

folder_paths = {
    'train': './data/data_train/train',
    'valid': './data/data_train/valid',
    'test': './data/data_test/test'
}

# The preprocessing, dataset and test cells refer to valid_df / test_df,
# which were never defined; bind the DataFrames under those names.
train_df = load_data_from_path(folder_paths['train'])
valid_df = load_data_from_path(folder_paths['valid'])
test_df = load_data_from_path(folder_paths['test'])

# Keep the original *_data names as aliases: the language filter reads train_data.
train_data, valid_data, test_data = train_df, valid_df, test_df


In [None]:
!pip install langid

from langid.langid import LanguageIdentifier, model
def identity_vn(df):
    """Split ``df`` into rows confidently identified as Vietnamese and the rest.

    A row counts as Vietnamese when langid labels its ``sentence`` as ``"vi"``
    with a normalised probability strictly above ``THRESHOLD``.

    Args:
        df: DataFrame with a ``sentence`` column.

    Returns:
        ``(vi_df, not_vi_df)`` as independent copies, so adding columns to
        either one later does not raise pandas' SettingWithCopyWarning.
    """
    identifier = LanguageIdentifier.from_pickled_model(model, norm_probs=True)
    THRESHOLD = 0.9

    def is_vietnamese(sentence):
        lang, prob = identifier.classify(sentence)
        return lang == "vi" and prob > THRESHOLD

    # A row-wise boolean mask stays correct even if the index has duplicates.
    vi_mask = df['sentence'].map(is_vietnamese).astype(bool)
    vi_df = df[vi_mask].copy()
    not_vi_df = df[~vi_mask].copy()
    return vi_df, not_vi_df

# Keep the training rows that pass the Vietnamese language filter;
# the rejected rows are returned separately as train_df_other.
train_df_vi, train_df_other = identity_vn(train_data)


– Xoá bỏ thẻ HTML, đường dẫn URL

– Xoá bỏ dấu câu, số

– Xoá bỏ các ký tự đặc biệt, emoticons,...

– Chuẩn hoá khoảng trắng

– Chuyển sang viết thường

In [None]:
import re
import string
# Patterns are compiled once at import time instead of on every call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_HTML_PATTERN = re.compile(r'<[^<>]+>')
_PUNCT_DIGIT_TABLE = str.maketrans(
    {char: " " for char in string.punctuation + string.digits}
)
# Code points above U+FFFF need the 8-digit \U escape: "\u1F251" is parsed as
# U+1F25 followed by the character "1", which produced a reversed range
# (re.error) and a "6"-to-U+1F93 range that would have stripped plain letters.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags
    "\U0001F926-\U0001F937"
    "\u2702-\u27B0"
    "\u24C2-\U0001F251"      # deliberately broad: also covers CJK and similar blocks
    "\u200d"                 # zero-width joiner
    "\u2640-\u2642"
    "]+",
    flags=re.UNICODE
)


def preprocess_text(text):
    """Normalise a raw review into lower-case, whitespace-separated words.

    Steps, in order: drop URLs (``http(s)://...`` and ``www....``), drop HTML
    tags, replace ASCII punctuation and digits with spaces, drop emoji and
    related symbols, collapse runs of whitespace, lower-case.

    Args:
        text: raw sentence.

    Returns:
        The cleaned sentence (possibly empty).
    """
    text = _URL_PATTERN.sub(" ", text)
    text = _HTML_PATTERN.sub(" ", text)
    # One C-level pass instead of one str.replace call per character.
    text = text.translate(_PUNCT_DIGIT_TABLE)
    text = _EMOJI_PATTERN.sub(" ", text)
    text = " ".join(text.split())
    return text.lower()

# Add a cleaned copy of every sentence to each split, in row order.
train_df_vi['preprocess_sentence'] = [
    preprocess_text(raw_sentence) for raw_sentence in train_df_vi['sentence']
]

valid_df['preprocess_sentence'] = [
    preprocess_text(raw_sentence) for raw_sentence in valid_df['sentence']
]

test_df['preprocess_sentence'] = [
    preprocess_text(raw_sentence) for raw_sentence in test_df['sentence']
]


In [None]:
!pip install -q torchtext==0.16.0

#word-based tokenizer
from torchtext.data.utils import get_tokenizer
# Tokenizer obtained from torchtext's get_tokenizer with the "basic_english"
# setting; it is used both to build the vocabulary and to encode sentences.
tokenizer = get_tokenizer("basic_english")

def yield_tokens(sentences, tokenizer):
    """Lazily yield ``tokenizer(sentence)`` for each sentence, in order."""
    yield from map(tokenizer, sentences)

from torchtext.vocab import build_vocab_from_iterator

# Build the vocabulary from the cleaned training sentences only, capped at
# vocab_size entries; tokens outside it fall back to the "<unk>" index.
vocab_size = 10000
vocabulary = build_vocab_from_iterator(
    yield_tokens(train_df_vi['preprocess_sentence'], tokenizer),
    max_tokens=vocab_size,
    specials=["<pad>","<unk>"],
)
vocabulary.set_default_index(vocabulary["<unk>"])

from torchtext.data.functional import to_map_style_dataset
def prepare_dataset(df):
    """Yield ``(token_ids, label)`` for every row of ``df``.

    Each ``preprocess_sentence`` is tokenised and mapped to vocabulary
    indices using the module-level ``tokenizer`` and ``vocabulary``. The
    encoded ids are yielded, not the raw text: the batching code needs
    integer sequences it can turn into tensors.

    Args:
        df: DataFrame with ``preprocess_sentence`` and ``label`` columns.
    """
    for sentence, label in zip(df['preprocess_sentence'], df['label']):
        encoded_sentence = vocabulary(tokenizer(sentence))
        yield encoded_sentence, label

# Wrap the prepare_dataset generators with to_map_style_dataset before they
# are handed to the DataLoaders.
train_dataset = to_map_style_dataset(prepare_dataset(train_df_vi))
valid_dataset = to_map_style_dataset(prepare_dataset(valid_df))


In [None]:
import torch
from torch.utils.data import DataLoader

def collate_batch(batch):
    """Collate ``(sentence, label)`` pairs into padded tensors.

    Args:
        batch: list of ``(sentence, label)`` pairs, where ``sentence`` is a
            sequence of vocabulary ids. A raw string is also accepted and is
            encoded on the fly with the module-level tokenizer/vocabulary.

    Returns:
        ``(encoded_sentences, labels)``: an int64 tensor padded with the
        ``<pad>`` index and laid out as ``(max_len, batch)`` (pad_sequence's
        default, ``batch_first=False``), and an int64 label tensor.
    """
    # Imported locally because the file never imports pad_sequence.
    from torch.nn.utils.rnn import pad_sequence

    encoded_sentences, labels = [], []
    for sentence, label in batch:
        if isinstance(sentence, str):
            sentence = vocabulary(tokenizer(sentence))
        encoded_sentences.append(torch.tensor(sentence, dtype=torch.int64))
        labels.append(label)

    labels = torch.tensor(labels, dtype=torch.int64)
    encoded_sentences = pad_sequence(
        encoded_sentences,
        padding_value=vocabulary["<pad>"]
    )

    return encoded_sentences, labels

batch_size = 128
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch
)
# Validation is not shuffled: a fixed batch composition keeps the
# mean-of-batch-losses metric reproducible from epoch to epoch.
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch
)



In [None]:
class TextCNN(nn.Module):
    """Text classifier with parallel 1-D convolutions over word embeddings.

    Each kernel size gets its own Conv1d; every feature map is max-pooled
    over the whole sequence, and the pooled features are concatenated and
    fed to a linear classifier.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: embedding width (Conv1d input channels).
        kernel_sizes: iterable of convolution window sizes.
        num_filters: output channels per convolution.
        num_classes: number of output logits.
    """

    def __init__(self, vocab_size, embedding_dim, kernel_sizes, num_filters, num_classes):
        super().__init__()

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.kernel_sizes = kernel_sizes
        self.num_filters = num_filters
        self.num_classes = num_classes
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.conv = nn.ModuleList([
            nn.Conv1d(
                in_channels=embedding_dim,
                out_channels=num_filters,
                kernel_size=k,
                stride=1
            ) for k in kernel_sizes
        ])
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, num_classes)

    def forward(self, x):
        """Return class logits for ``x``, a 2-D tensor of token ids laid out
        as ``(sequence_length, batch)``."""
        if x.dim() != 2:
            raise ValueError(
                f"expected a 2-D (sequence_length, batch) tensor, got {x.dim()}-D"
            )
        token_ids = x.t()  # -> (batch, sequence_length)

        # A sequence shorter than the widest kernel would make Conv1d fail;
        # right-pad it with the padding index instead.
        min_length = max(self.kernel_sizes)
        if token_ids.size(1) < min_length:
            token_ids = F.pad(
                token_ids,
                (0, min_length - token_ids.size(1)),
                value=self.embedding.padding_idx,
            )

        x = self.embedding(token_ids).transpose(1, 2)  # (batch, emb, seq)
        x = [F.relu(conv(x)) for conv in self.conv]
        # Global max-pool over the sequence axis of every feature map.
        x = [F.max_pool1d(c, c.size(-1)).squeeze(dim=-1) for c in x]
        x = torch.cat(x, dim=1)
        x = self.fc(x)
        return x

In [None]:
#Training
import time
def train(model, optimizer, criterion, train_dataloader, device, epoch=0, log_interval=50):
    """Run one training epoch and return ``(epoch_acc, epoch_loss)``.

    Args:
        model: module to optimise; switched to train mode.
        optimizer: optimizer stepping ``model``'s parameters.
        criterion: loss called as ``criterion(predictions, labels)``.
        train_dataloader: iterable of ``(inputs, labels)`` batches with ``len()``.
        device: device the batches are moved to.
        epoch: epoch number, used only in the progress log.
        log_interval: print the running accuracy every this many batches.

    Returns:
        ``(epoch_acc, epoch_loss)``: accuracy over every sample seen in the
        epoch and the mean of the per-batch losses. The order matches
        ``evaluate`` and the way the training loops unpack the result.
        Both values are 0.0 when the dataloader yields no batches.
    """
    model.train()
    window_correct, window_count = 0, 0  # since the last progress log
    epoch_correct, epoch_count = 0, 0    # whole epoch
    losses = []

    for idx, (inputs, labels) in enumerate(train_dataloader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        predictions = model(inputs)

        loss = criterion(predictions, labels)
        losses.append(loss.item())

        # backward
        loss.backward()

        optimizer.step()

        batch_correct = (predictions.argmax(1) == labels).sum().item()
        batch_count = labels.size(0)
        window_correct += batch_correct
        window_count += batch_count
        epoch_correct += batch_correct
        epoch_count += batch_count

        if idx % log_interval == 0 and idx > 0:
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(train_dataloader),
                    window_correct / window_count
                )
            )
            # Only the logging window is reset; the epoch totals keep growing,
            # so the returned accuracy never divides by zero after a log step.
            window_correct, window_count = 0, 0

    epoch_acc = epoch_correct / epoch_count if epoch_count else 0.0
    epoch_loss = sum(losses) / len(losses) if losses else 0.0
    return epoch_acc, epoch_loss

In [None]:
#Evaluation function
def evaluate(model, criterion, valid_dataloader, device=None):
    """Evaluate ``model`` on a dataloader without tracking gradients.

    Args:
        model: module to evaluate; switched to eval mode.
        criterion: loss called as ``criterion(predictions, labels)``.
        valid_dataloader: iterable of ``(inputs, labels)`` batches.
        device: device the batches are moved to. Defaults to the device of
            the model's first parameter (CPU for a parameter-free model), so
            the function no longer depends on a global ``device`` variable.

    Returns:
        ``(epoch_acc, epoch_loss)``: accuracy over all samples and the mean of
        the per-batch losses; both are 0.0 for an empty dataloader.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    total_acc, total_count = 0, 0
    losses = []
    with torch.no_grad():
        for inputs, labels in valid_dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            predictions = model(inputs)
            loss = criterion(predictions, labels)
            losses.append(loss.item())
            total_acc += (predictions.argmax(1) == labels).sum().item()
            total_count += labels.size(0)
    epoch_acc = total_acc / total_count if total_count else 0.0
    epoch_loss = sum(losses) / len(losses) if losses else 0.0
    return epoch_acc, epoch_loss

In [None]:
num_class = 2
vocab_size = len(vocabulary)
embedding_dim = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = TextCNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    kernel_sizes=[3, 4, 5],
    num_filters=100,
    num_classes=num_class
)
model.to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

num_epochs = 10
save_model = './model'
# torch.save does not create missing directories.
os.makedirs(save_model, exist_ok=True)
best_model_path = os.path.join(save_model, 'text_cnn_model.pt')

train_accs, train_losses = [], []
eval_accs, eval_losses = [], []
best_loss_eval = float('inf')

for epoch in range(1, num_epochs + 1):
    epoch_start_time = time.time()
    # Training
    # NOTE(review): this unpacking order must match the tuple order returned by train().
    train_acc, train_loss = train(model, optimizer, criterion, train_dataloader, device, epoch)
    train_accs.append(train_acc)
    train_losses.append(train_loss)

    # Evaluation
    eval_acc, eval_loss = evaluate(model, criterion, valid_dataloader)
    eval_accs.append(eval_acc)
    eval_losses.append(eval_loss)

    # Save best model: the threshold has to be updated, otherwise every epoch
    # overwrites the checkpoint and "best" just means "latest".
    if eval_loss < best_loss_eval:
        best_loss_eval = eval_loss
        torch.save(model.state_dict(), best_model_path)

    # Print loss, acc and epoch
    print("-" * 59)
    print("| End of epoch {:3d} | time: {:5.2f}s | Train Accuracy {:8.3f} | Train Loss {:8.3f} |"
          "Valid Accuracy {:8.3f} | Valid Loss {:8.3f} |".format(
              epoch, time.time() - epoch_start_time, train_acc, train_loss, eval_acc, eval_loss
          )
    )
    print("-" * 59)

# Load the best model once training has finished. Doing this inside the loop
# would reset the weights to the checkpoint after every epoch.
model.load_state_dict(torch.load(best_model_path, map_location=device))
model.eval()

In [None]:
# Encode the test split, batch it in its original order, and score the model.
test_dataset = to_map_style_dataset(prepare_dataset(test_df))
test_dataloader = DataLoader(
    test_dataset,
    collate_fn=collate_batch,
    shuffle=False,
    batch_size=batch_size,
)

test_acc, test_loss = evaluate(
    model,
    criterion,
    test_dataloader,
)
print(f"Test Accuracy: {test_acc}, Test Loss: {test_loss}")