In [2]:
# Newline-separated repository URLs. A later cell splits this string on
# whitespace and runs `git clone` on each entry.
# NOTE(review): some entries repeat the same repository, and some point at a
# `/tree/...` sub-path instead of the repository root.
github_list = '''https://github.com/GwLewis369/Hand-written-digit-classification-data.git
https://github.com/gbaonr/CS114_handwritten_digits_data
https://github.com/adamwhite625/CS114_hand_written_digit.git
https://github.com/theRaven1312/CS114.P21
https://github.com/Khoiisme1905/CS114.git
https://github.com/ProjectHT1/machinelearning
https://github.com/Searching96/hand-written-digit.git
https://github.com/Salmon1605/CS114
https://github.com/anhtuann1224/hand_written_digit
https://github.com/NATuanAN/Hand_written_digit_classification_data.git
https://github.com/huapogba/may-hoc
https://github.com/hieutran890j2/CS114.git
https://github.com/votanhoang483/CS114.P21-Hand_written_digit_classification
https://github.com/DHPh/CS114_hand_written_digit/
https://github.com/thaituanUIT/ReminiScenceAI
https://github.com/anngyn/CS114-Hand-Written-Digit
https://github.com/lngphgthao/cs114-hand-written-digit-classification/tree/main/hand_written_digit
https://github.com/Toan02Ky-UIT/CS114
https://github.com/Lochke/CS114_Handwritten_Digit_Classification.git
https://github.com/NThong325/CS114/tree/cfd654a14dd471f5272387139d586ddcbf9cdf7e/hand_written_digit
https://github.com/toanlamdata/digit-recognition-group
https://github.com/Nohenshin/CS114.P21-2025-
https://github.com/hmcslearning/ML1142025
https://github.com/lngphgthao/cs114-hand-written-digit-classification
https://github.com/huapogba/may-hoc
https://github.com/NThong325/CS114/tree/main/hand_written_digit
https://github.com/23520276/Hand-written-digit-classification/
https://github.com/NThong325/CS114
'''

# Working directory: repositories are cloned into it, and the 8-digit-named
# folders found inside them are copied to its top level.
data_dir = r'E:\Code\Github\CS114.P21_project\digit_classification\data\image_raw_v1\data'

In [3]:
import os
import re
import shutil
import stat
import subprocess
import sys

# The collection directory must exist before it can be listed.
os.makedirs(data_dir, exist_ok=True)

# Matches names consisting of exactly eight digits.
eight_digit_pattern = re.compile(r"^\d{8}$")

# Names already present at the top level of data_dir that match the pattern;
# used later to avoid copying the same folder twice.
existing_digits = set(filter(eight_digit_pattern.match, os.listdir(data_dir)))

def is_leaf_folder(path):
    """Return True when none of the direct entries of `path` is a directory.

    An empty directory counts as a leaf. Raises whatever `os.listdir` raises
    if `path` cannot be listed.
    """
    for entry in os.listdir(path):
        if os.path.isdir(os.path.join(path, entry)):
            return False
    return True

GITHUB_PREFIX = "https://github.com/"


def _parse_repo_url(url):
    """Turn one list entry into ``(clone_url, local_dir_name)``.

    For GitHub URLs only the ``owner/repo`` part is kept: a trailing slash, a
    ``.git`` suffix and any ``/tree/<ref>/<path>`` tail are dropped, because
    `git clone` needs the repository root. Any other URL is passed through
    unchanged and named with the original replace-based scheme.
    """
    if url.startswith(GITHUB_PREFIX):
        parts = [p for p in url[len(GITHUB_PREFIX):].split("/") if p]
        if len(parts) >= 2:
            owner, repo = parts[0], parts[1]
            # Strip ".git" only as a suffix, never from the middle of a name.
            if repo.endswith(".git"):
                repo = repo[:-len(".git")]
            return f"{GITHUB_PREFIX}{owner}/{repo}", f"{owner}_{repo}"
    return url, url.replace(GITHUB_PREFIX, "").replace(".git", "").replace("/", "_")


def _remove_tree(path):
    """Recursively delete `path`, retrying entries that are read-only.

    A plain `shutil.rmtree` fails on Windows when files have the read-only bit
    set (the usual reason a cloned repository cannot be removed), so the
    handler makes the offending entry writable and retries the failed call.
    Raises OSError if an entry still cannot be removed.
    """
    def _make_writable_and_retry(func, target, _exc):
        os.chmod(target, stat.S_IWRITE)
        func(target)

    # `onexc` replaced the deprecated `onerror` parameter in Python 3.12.
    if sys.version_info >= (3, 12):
        shutil.rmtree(path, onexc=_make_writable_and_retry)
    else:
        shutil.rmtree(path, onerror=_make_writable_and_retry)


# Repositories already processed in this run (the list contains repeats).
seen_repos = set()

for url in github_list.split():
    clone_url, repo_dir = _parse_repo_url(url)

    repo_key = clone_url.lower()
    if repo_key in seen_repos:
        print(f"Skipping duplicate repository: {url}")
        continue
    seen_repos.add(repo_key)

    clone_path = os.path.join(data_dir, repo_dir)

    # A leftover clone from an earlier run would make `git clone` fail.
    if os.path.exists(clone_path):
        try:
            _remove_tree(clone_path)
        except OSError as e:
            print(f"Failed to delete {clone_path}: {e}")
            continue

    print(f"\nCloning {clone_url} → {clone_path}")
    try:
        # NOTE: a shallow clone of the default branch; entries that pointed at
        # a specific branch or commit via /tree/... are not pinned to that ref.
        subprocess.run(["git", "clone", "--depth", "1", clone_url, clone_path], check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        print(f"Failed to clone {clone_url}: {e}")
        continue

    for root, dirs, files in os.walk(clone_path):
        # Prune git metadata by exact name so the walk never descends into it
        # (a substring test on the path would also skip e.g. ".github").
        dirs[:] = [d for d in dirs if d != ".git"]

        for d in dirs:
            if not eight_digit_pattern.match(d):
                continue
            full_path = os.path.join(root, d)
            if not is_leaf_folder(full_path):
                continue
            if d in existing_digits:
                print(f"Skipping existing folder: {d}")
                continue
            dest_path = os.path.join(data_dir, d)
            shutil.copytree(full_path, dest_path)
            existing_digits.add(d)
            print(f"Copied: {full_path} → {dest_path}")

    # Remove the temporary clone; a failure here is reported but not fatal.
    try:
        _remove_tree(clone_path)
    except OSError as e:
        print(f"Failed to delete {clone_path}: {e}")



Cloning https://github.com/GwLewis369/Hand-written-digit-classification-data.git → E:\Code\Github\CS114.P21_project\digit_classification\data\image_raw_v1\GwLewis369_Hand-written-digit-classification-data
Copied: E:\Code\Github\CS114.P21_project\digit_classification\data\image_raw_v1\GwLewis369_Hand-written-digit-classification-data\hand_written_digit\22520369 → E:\Code\Github\CS114.P21_project\digit_classification\data\image_raw_v1\22520369
Copied: E:\Code\Github\CS114.P21_project\digit_classification\data\image_raw_v1\GwLewis369_Hand-written-digit-classification-data\hand_written_digit\23521387 → E:\Code\Github\CS114.P21_project\digit_classification\data\image_raw_v1\23521387
Copied: E:\Code\Github\CS114.P21_project\digit_classification\data\image_raw_v1\GwLewis369_Hand-written-digit-classification-data\hand_written_digit\23521419 → E:\Code\Github\CS114.P21_project\digit_classification\data\image_raw_v1\23521419

Cloning https://github.com/gbaonr/CS114_handwritten_digits_data → E:\C

In [4]:
import os
import glob
from tabulate import tabulate

# Folders directly under data_dir whose 8-character name has "52" in the third
# and fourth positions (MSSV is presumably the student ID, TODO confirm).
pattern = os.path.join(data_dir, '??52????')
# glob returns matches in filesystem-dependent order; sort so the table is
# reproducible across machines and runs.
matching_folders = sorted(glob.glob(pattern))

# The "additional" folder is always reported, after the pattern matches.
matching_folders.append(os.path.join(data_dir, "additional"))

# One row per folder: name, count of files named "<digit>_*" for 0..9, row sum.
table = []
for folder in matching_folders:
    mssv = os.path.basename(folder)
    digit_counts = [
        len(glob.glob(os.path.join(folder, f"{num}_*")))
        for num in range(10)
    ]
    table.append([mssv, *digit_counts, sum(digit_counts)])

# Column-wise totals over every numeric column (everything after the name),
# derived from the rows rather than from a hard-coded column count.
numeric_columns = list(zip(*table))[1:]
totals = ["Total", *(sum(column) for column in numeric_columns)]
table.append(totals)

headers = ['MSSV', *map(str, range(10)), 'Sum']
print(tabulate(table, headers=headers))


MSSV          0    1    2    3    4    5    6    7    8    9    Sum
----------  ---  ---  ---  ---  ---  ---  ---  ---  ---  ---  -----
21522592     12   12   16   12   12    7    8    8    6    6     99
21522689     10   10   10   10   10   10   10   10   10   10    100
22520019     13   16   13   13   13   11   14   15   12   13    133
22520068     10   10   10   10   10   10   10   10   10   10    100
22520069     10   10   10   10   10   10   10   10   10   10    100
22520077     10   10   10   10   10   10   10   10   10   10    100
22520109     10   10   10   10   10   10   10   10   10   10    100
22520192     21   21   21   21   21   21   21   21   21   21    210
22520236     10   10   10   10   10   10   10   10   10   10    100
22520369     15   15   15   15   15   15   15   15   15   15    150
22520483     10   10   10   10   10   10   10   10   10   10    100
22520696     10   10   10   10   10   10   10   10   10   10    100
22520710     10   10   10   10   10   10   10   

In [6]:
import os
import shutil
# Sort: rebuild a clean `sorted/<digit>/` tree and copy every labelled image
# found below data_dir into the folder of its digit.
sort_folder = r'E:\Code\Github\CS114.P21_project\digit_classification\data\image_raw_v1\sorted'
if os.path.exists(sort_folder):
    shutil.rmtree(sort_folder)
os.makedirs(sort_folder)

for i in range(10):
    label_folder = os.path.join(sort_folder, str(i))
    os.makedirs(label_folder)

# Only these extensions are copied; anything else (including HEIC photos) is
# reported as "not image" and skipped.
image_extensions = {'.jpg', '.jpeg', '.png', '.jfif'}

# NOTE(review): `id` shadows the builtin of the same name. The name is kept
# because cells outside this view may read it; rename once that is confirmed.
id = 0      # number of images copied; also the unique prefix of each copy
skip = 0    # number of files skipped, for any reason

# Used to keep the walk out of the output tree if it lies inside data_dir,
# which would otherwise copy already-sorted files a second time.
sort_root = os.path.normcase(os.path.abspath(sort_folder))

for dirpath, dirnames, filenames in os.walk(data_dir):
    dirnames[:] = [
        d for d in dirnames
        if os.path.normcase(os.path.abspath(os.path.join(dirpath, d))) != sort_root
    ]

    # Files lying directly in data_dir are ignored; only sub-folders count.
    if dirpath == data_dir:
        continue

    for filename in filenames:
        ext = os.path.splitext(filename)[1].lower()
        src_path = os.path.join(dirpath, filename)

        if ext not in image_extensions:
            print(f"Skipped (not image): {src_path}")
            skip += 1
            continue

        # The label is the part of the file name before the first underscore.
        label = filename.split('_')[0]
        try:
            digit = int(label)
        except ValueError:
            print(f"Skipped (invalid label): {src_path}")
            skip += 1
            continue

        if not 0 <= digit <= 9:
            print(f"Skipped (digit out of range): {src_path}")
            skip += 1
            continue

        # Prefix with a running counter so equal file names from different
        # source folders cannot overwrite each other.
        dst_path = os.path.join(sort_folder, str(digit), str(id) + "_" + filename)
        shutil.copy(src_path, dst_path)
        id += 1

print(f"Total remain image: {id}")
print(f"Total skip image: {skip}")

Skipped (not image): E:\Code\Github\CS114.P21_project\digit_classification\data\image_raw_v1\data\22520077\0_blue1.HEIC
Skipped (not image): E:\Code\Github\CS114.P21_project\digit_classification\data\image_raw_v1\data\22520077\0_blue2.HEIC
Skipped (not image): E:\Code\Github\CS114.P21_project\digit_classification\data\image_raw_v1\data\22520077\0_blue3.HEIC
Skipped (not image): E:\Code\Github\CS114.P21_project\digit_classification\data\image_raw_v1\data\22520077\0_blue4.HEIC
Skipped (not image): E:\Code\Github\CS114.P21_project\digit_classification\data\image_raw_v1\data\22520077\0_blue5.HEIC
Skipped (not image): E:\Code\Github\CS114.P21_project\digit_classification\data\image_raw_v1\data\22520077\0_red1.HEIC
Skipped (not image): E:\Code\Github\CS114.P21_project\digit_classification\data\image_raw_v1\data\22520077\0_red2.HEIC
Skipped (not image): E:\Code\Github\CS114.P21_project\digit_classification\data\image_raw_v1\data\22520077\0_red3.HEIC
Skipped (not image): E:\Code\Github\CS114.P