In [1]:
from IPython.display import display, Markdown, Code
import tempfile
from pathlib import Path, PurePath
import os
import subprocess
import shlex
import hashlib
import zlib
import time
from typing import Mapping, Tuple, Literal, Optional, List

In [2]:
# Fresh temporary directory that serves as the working tree of the mock repository.
base = tempfile.mkdtemp(prefix='git-mock-')

# Render the equivalent shell step so the notebook reads like a terminal session.
display(Code(f'cd {base}'))

In [3]:
def run_cmd(cmd):
    """Run a command line and render the command plus its output as a code block.

    Args:
        cmd: command line as a single string; it is tokenised with
            ``shlex.split`` and executed without a shell.

    The rendered text is the re-quoted command prefixed with ``>>> ``,
    followed by stdout, or by stderr when stdout is empty.
    """
    argv = shlex.split(cmd)
    result = subprocess.run(argv, capture_output=True, encoding='utf-8')

    echoed = shlex.join(result.args)
    # stderr is only shown when the command printed nothing on stdout.
    output = result.stdout if result.stdout else result.stderr

    display(Code(f'>>> {echoed}\n{output}'))

In [4]:
run_cmd('git --version')

### git init

In [5]:
# Directories found inside a freshly initialised `.git` directory.
init_empty_dirs = [
    'branches',
    'hooks',
    'info',
    'objects/info',
    'objects/pack',
    'refs/heads',
    'refs/tags'
]

git_dir = PurePath(base).joinpath('.git')
# exist_ok keeps the cell re-runnable: a plain os.mkdir raises FileExistsError
# the second time this cell is executed against the same `base`.
os.makedirs(git_dir, exist_ok=True)

for d in init_empty_dirs:
    os.makedirs(git_dir.joinpath(d), exist_ok=True)

In [6]:
# Write the repository configuration file (`.git/config`) with a minimal [core] section.
with open(git_dir.joinpath('config'), 'wt', encoding='ascii') as f:
    f.write('''\
[core]
    repositoryformatversion = 0
    filemode = true
    bare = false
    logallrefupdates = true
''')

In [7]:
# HEAD is a symbolic ref naming the current branch. Git writes this file with a
# trailing newline; newline='\n' stops text mode from emitting '\r\n' on Windows.
with open(git_dir.joinpath('HEAD'), 'wt', encoding='ascii', newline='\n') as f:
    f.write('ref: refs/heads/master\n')

In [8]:
run_cmd(f'tree {base} -a')

In [9]:
run_cmd(f'git -C {base} status')

### git add

In [10]:
# Create a small file in the working tree; later cells store it as a blob,
# add it to the index and reference it from a tree.
blob_text = 'very first file 1'
blob_file_name = 'first.md'
with open(os.path.join(base, blob_file_name), 'wt', encoding='utf-8') as f:
    f.write(blob_text)

In [11]:
run_cmd(f'tree {base} -a')

In [12]:
run_cmd(f'git -C {base} status')

In [13]:
def write_object(raw_content: bytes, sha1: str, git_dir: PurePath) -> None:
    """Store an object as a zlib-compressed loose object file.

    Args:
        raw_content: complete object bytes (header and payload) to compress.
        sha1: hex object id; the first two characters name the sub-directory
            under ``objects`` and the rest names the file.
        git_dir: path of the ``.git`` directory.
    """
    payload = zlib.compress(raw_content)

    fan_out, file_name = sha1[:2], sha1[2:]
    target_dir = git_dir / 'objects' / fan_out
    os.makedirs(target_dir, exist_ok=True)

    with open(target_dir / file_name, 'wb') as out:
        out.write(payload)

def write_blob_object(file_content: str) -> str:
    """Store text as a Git blob object and return its SHA-1 hex digest.

    Args:
        file_content: the file's text; it is encoded as UTF-8 before hashing.

    Returns:
        The 40-character hex object id of the blob.
    """
    content_bytes = file_content.encode('utf-8')
    # The size in the object header counts BYTES of the payload. Using the
    # character count (len of the str) is wrong for any non-ASCII text and
    # yields an object Git rejects as corrupt.
    raw_content = f'blob {len(content_bytes)}\0'.encode('ascii') + content_bytes
    sha1 = hashlib.sha1(raw_content).hexdigest()

    write_object(raw_content, sha1, git_dir)

    return sha1

In [14]:
# Read the file back as text and store its content as a blob object.
with open(os.path.join(base, blob_file_name), 'rt', encoding='utf-8') as f:
    file_content = f.read()
    
# blob_sha is reused by later cells (index entry, tree entry, cat-file).
blob_sha = write_blob_object(file_content)
print(blob_sha)

2a94aaff68840af828318ce66927ef8782d9d5dd


In [15]:
run_cmd(f'git -C {base} cat-file -p {blob_sha}')

In [16]:
run_cmd(f'tree {base} -a')

In [17]:
run_cmd(f'git -C {base} status')

In [18]:
"""
ref: https://git-scm.com/docs/index-format/2.25.0
"""
class IndexEntry:
    """One entry of a version-2 Git index file.

    Layout reference: https://git-scm.com/docs/index-format/2.25.0

    Attributes:
        path: path of the file on disk; it is ``os.stat``-ed when serialising.
        blob_sha: hex SHA-1 of the blob that holds the file's content.
        base_path: directory the stored entry name is made relative to.
    """

    def __init__(self, path: str, blob_sha: str, base_path: str):
        self.path = path
        self.blob_sha = blob_sha
        self.base_path = base_path

    @staticmethod
    def _u32(value: int) -> bytes:
        """Encode ``value`` as a big-endian 32-bit field, truncating higher bits.

        The index stores these stat fields in 32 bits; masking avoids the
        OverflowError that ``int.to_bytes(4, ...)`` raises for 64-bit inode,
        device or size values.
        """
        return (int(value) & 0xFFFFFFFF).to_bytes(4, byteorder='big')

    def to_bytes(self) -> bytes:
        """Serialise the entry, reading the file's current stat data.

        Returns:
            The entry bytes, NUL-padded to a multiple of eight bytes.

        Raises:
            OSError: if ``path`` cannot be stat-ed.
        """
        stat = os.stat(self.path)

        # Integer arithmetic keeps the nanosecond part exact; going through
        # a float (``% 1e9``) loses precision on present-day timestamps.
        ctime_s, ctime_ns = divmod(stat.st_ctime_ns, 1_000_000_000)
        mtime_s, mtime_ns = divmod(stat.st_mtime_ns, 1_000_000_000)

        b = self._u32(ctime_s) + self._u32(ctime_ns)
        b += self._u32(mtime_s) + self._u32(mtime_ns)

        b += self._u32(stat.st_dev)
        b += self._u32(stat.st_ino)
        b += self._u32(0o100644)  # regular, non-executable file
        b += self._u32(stat.st_uid)
        b += self._u32(stat.st_gid)
        b += self._u32(stat.st_size)

        b += bytes.fromhex(self.blob_sha)

        # Entry names are relative to the repository root and use '/' as the
        # separator regardless of platform.
        relative_path_name = (
            os.path.relpath(self.path, self.base_path)
            .replace(os.sep, '/')
            .encode('utf-8')
        )

        assume_valid_flag = 0
        extended_flag = 0
        merge_stage = 0
        # The 12-bit length field describes the name actually written below
        # (relative path, in bytes), capped at 0xFFF.
        name_length = min(len(relative_path_name), 0xfff)
        flags = (
            (assume_valid_flag << 15)
            | (extended_flag << 14)
            | (merge_stage << 12)
            | name_length
        )
        b += flags.to_bytes(2, byteorder='big')

        b += relative_path_name

        # 1-8 NUL bytes: always at least one, so the name stays NUL-terminated.
        padding_size = 8 - (len(b) % 8)
        b += b'\0' * padding_size

        return b

In [19]:
def write_index_file(entries: List['IndexEntry']) -> None:
    """Write ``entries`` to the repository's index file (``<git_dir>/index``).

    The file consists of a 12-byte header (``DIRC`` signature, version 2,
    entry count), the serialised entries, and a trailing SHA-1 checksum over
    everything before it.

    Args:
        entries: index entries in any order; each must provide ``path``,
            ``base_path`` and ``to_bytes()``.
    """
    def entry_name(entry) -> bytes:
        # Same name bytes the entry stores: repo-relative, '/'-separated.
        relative = os.path.relpath(entry.path, entry.base_path)
        return relative.replace(os.sep, '/').encode('utf-8')

    # The index format requires entries in ascending order of their name
    # bytes; writing them in caller order gives an index Git rejects.
    ordered = sorted(entries, key=entry_name)

    signature = b'DIRC'
    version = (2).to_bytes(4, byteorder='big')
    entries_number = len(ordered).to_bytes(4, byteorder='big')

    raw_content = signature + version + entries_number + b''.join(e.to_bytes() for e in ordered)
    raw_content += hashlib.sha1(raw_content).digest()

    with open(git_dir.joinpath('index'), 'wb') as f:
        f.write(raw_content)

In [20]:
# Build an index entry for the file created earlier and write an index
# file that contains only that entry.
e = IndexEntry(path=os.path.join(base, blob_file_name), blob_sha=blob_sha, base_path=base)
write_index_file([e])

In [21]:
run_cmd(f'tree {base} -a')

In [22]:
run_cmd(f'git -C {base} status')

### git commit

In [23]:
class TreeEntry:
    """One row of a Git tree object: a mode, a name and the referenced object id.

    Attributes:
        object_type: one of ``'tree'``, ``'blob'``, ``'commit'``, ``'tag'``.
        name: entry name inside the tree.
        sha: hex SHA-1 of the referenced object.
        mode: octal mode string written in front of the name.
    """

    def __init__(self, object_type: str, name: str, sha: str):
        assert(object_type in ('tree', 'blob', 'commit', 'tag'))
        self.object_type = object_type
        self.name = name
        self.sha = sha
        if object_type == 'blob':
            self.mode = '100644'
        elif object_type == 'commit':
            # A commit inside a tree is a gitlink (submodule) and uses mode
            # 160000; the directory mode would make Git read it as a subtree.
            self.mode = '160000'
        else:
            # NOTE(review): 'tag' is accepted for backward compatibility and
            # keeps the directory mode it always got here, although Git trees
            # do not contain tag entries.
            self.mode = '40000'

In [24]:
def write_tree_object(entries: List['TreeEntry']) -> str:
    """Store a Git tree object built from ``entries`` and return its SHA-1.

    Args:
        entries: tree rows in any order; each must provide ``mode``, ``name``
            and ``sha`` (hex).

    Returns:
        The 40-character hex object id of the tree.
    """
    def git_order(entry) -> bytes:
        # Git compares entry names bytewise, and a directory name compares as
        # if it ended in '/'. A plain name sort therefore misplaces a
        # directory `foo` relative to a file such as `foo.txt`.
        name = entry.name.encode('utf-8')
        return name + b'/' if int(entry.mode, 8) == 0o40000 else name

    sorted_entries = sorted(entries, key=git_order)

    entries_content = b''.join(
        f'{e.mode} {e.name}\0'.encode('utf-8') + bytes.fromhex(e.sha)
        for e in sorted_entries
    )
    raw_content = f'tree {len(entries_content)}\0'.encode('utf-8') + entries_content
    sha1 = hashlib.sha1(raw_content).hexdigest()

    write_object(raw_content, sha1, git_dir)

    return sha1

In [25]:
# Wrap the blob in a single-entry tree object and print the tree's id.
e = TreeEntry(object_type='blob', name=blob_file_name, sha=blob_sha)
tree_sha = write_tree_object([e])
print(tree_sha)

866663a19922bf6851b415da2220ff75a3ba0e06


In [26]:
run_cmd(f'git -C {base} cat-file -p {tree_sha}')

In [27]:
run_cmd(f'tree {base} -a')

In [28]:
run_cmd(f'git -C {base} status')

In [29]:
my_name = 'Soros Liu'
my_email = 'soros.liu1029@gmail.com'

def write_commit_object(tree_sha: str, parent_commmit_sha: Optional[str], msg: str) -> str:
    """Store a Git commit object and return its SHA-1 hex digest.

    Args:
        tree_sha: hex id of the tree the commit points at.
        parent_commmit_sha: hex id of the parent commit, or ``None`` / empty
            for a root commit (spelling kept for existing keyword callers).
        msg: commit message; a trailing newline is appended.

    Returns:
        The 40-character hex object id of the commit.

    Author and committer both come from the module-level ``my_name`` and
    ``my_email``. The UTC offset is fixed at ``+0800``.
    """
    # Read the clock once so author and committer always carry the same
    # timestamp; two separate calls can straddle a second boundary.
    timestamp = int(time.time())
    identity = f'{my_name} <{my_email}> {timestamp} +0800'

    lines = [f'tree {tree_sha}']
    if parent_commmit_sha:
        lines.append(f'parent {parent_commmit_sha}')
    lines += [f'author {identity}', f'committer {identity}', '', msg]

    commit_content = ('\n'.join(lines) + '\n').encode('utf-8')
    raw_content = f'commit {len(commit_content)}\0'.encode('utf-8') + commit_content
    sha1 = hashlib.sha1(raw_content).hexdigest()

    write_object(raw_content, sha1, git_dir)

    return sha1

In [30]:
# Create a commit with no parent that points at the tree, then print its id.
commit_sha = write_commit_object(tree_sha, None, 'first commit')
print(commit_sha)

446ec1d6b4008c41cd4fc82b6e549107963a9ea4


In [31]:
run_cmd(f'git -C {base} cat-file -p {commit_sha}')

In [32]:
run_cmd(f'tree {base} -a')

In [33]:
run_cmd(f'git -C {base} status')

In [34]:
run_cmd(f'git -C {base} log')

### bookmark

In [35]:
# Point the branch named in HEAD at the new commit. Git writes loose ref files
# as the hex id followed by a newline; newline='\n' stops text mode from
# emitting '\r\n' on Windows.
with open(git_dir.joinpath('refs', 'heads', 'master'), 'wt', encoding='ascii', newline='\n') as f:
    f.write(commit_sha + '\n')

In [36]:
run_cmd(f'tree {base} -a')

In [37]:
run_cmd(f'git -C {base} status')

In [38]:
run_cmd(f'git -C {base} log')

In [39]:
run_cmd(f'git -C {base} fsck --verbose')