In [1]:
import torch
import torch.nn as nn
import torch.functional as F
import tensorflow as tf

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from collections import Counter


import re
import os



## Importing the CPP data from kaggle

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [23]:
!git clone https://github.com/ggml-org/llama.cpp.git

Cloning into 'llama.cpp'...


In [24]:
import os

# Check that the repo exists in your current working directory
os.listdir("llama.cpp")[:10]

['.clang-format',
 '.clang-tidy',
 '.devops',
 '.dockerignore',
 '.ecrc',
 '.editorconfig',
 '.flake8',
 '.git',
 '.github',
 '.gitignore']

In [25]:
import glob

# Find all .cpp files inside the cloned repo (recursively)
cpp_files = glob.glob("llama.cpp/**/*.cpp", recursive=True)

print(f"✅ Found {len(cpp_files)} .cpp files")
print("Example files:\n", cpp_files[:10])

✅ Found 287 .cpp files
Example files:
 ['llama.cpp\\common\\arg.cpp', 'llama.cpp\\common\\chat-parser.cpp', 'llama.cpp\\common\\chat.cpp', 'llama.cpp\\common\\common.cpp', 'llama.cpp\\common\\console.cpp', 'llama.cpp\\common\\json-partial.cpp', 'llama.cpp\\common\\json-schema-to-grammar.cpp', 'llama.cpp\\common\\llguidance.cpp', 'llama.cpp\\common\\log.cpp', 'llama.cpp\\common\\ngram-cache.cpp']


In [29]:
combined_cpp_text = ""

for file in cpp_files:
    try:
        with open(file, "r", encoding="utf-8", errors="ignore") as f:
            combined_cpp_text += f.read() + "\n\n"
    except PermissionError:
        print(f"⚠️ Skipping (no permission): {file}")
    except Exception as e:
        print(f"⚠️ Skipping {file} due to error: {e}")

⚠️ Skipping (no permission): llama.cpp\tools\run\linenoise.cpp


In [30]:
len(combined_cpp_text)

9625796

In [34]:
import re
import pandas as pd
from collections import Counter

# ✅ Use your existing in-memory string
cpp_text = combined_cpp_text  # already loaded earlier

# ✅ Tokenize (C++ syntax-aware)
tokens = re.findall(r'[A-Za-z_][A-Za-z0-9_]*|[{}()\[\];.,=+\-*/<>!&|^%~?:]', cpp_text)

# ✅ Count token frequencies
token_counts = Counter(tokens)
df_counts = pd.DataFrame(token_counts.items(), columns=["token", "frequency"])
df_counts = df_counts.sort_values(by="frequency", ascending=False)

# ✅ Show stats
print("Total tokens:", len(tokens))
print("Unique tokens:", len(df_counts))

print("\nTop 20 most frequent tokens:\n")
print(df_counts.head(20).to_string(index=False))

# ✅ Optionally save vocabulary to CSV for later reuse
df_counts.to_csv("cpp_vocabulary.csv", index=False)
print("📁 Saved token frequencies to cpp_vocabulary.csv")

Total tokens: 1920541
Unique tokens: 43226

Top 20 most frequent tokens:

token  frequency
    ,     158051
    )     133160
    (     133136
    ;      97098
    =      75524
    .      55274
    -      52475
    {      48500
    }      48314
    /      47664
    :      45946
    *      38311
    >      36902
    [      27825
    ]      27533
    <      22894
const      20517
    +      19943
    &      15311
   if      11996
📁 Saved token frequencies to cpp_vocabulary.csv


In [35]:
import torch

# ✅ Create mappings (string ↔ integer)
stoi = {s: i for i, s in enumerate(df_counts["token"])}
itos = {i: s for s, i in stoi.items()}

# ✅ Define context size (how many previous tokens to use for prediction)
context_size = 5  # you can experiment with 3–10

# ✅ Prepare X (context) and Y (target) lists
X, Y = [], []

for i in range(len(tokens) - context_size):
    context = tokens[i:i + context_size]
    target = tokens[i + context_size]
    X.append([stoi[t] for t in context])
    Y.append(stoi[target])

print(f"✅ Created {len(X):,} training samples")

✅ Created 1,920,536 training samples


In [36]:
# ✅ Move to torch tensors
X = torch.tensor(X)
Y = torch.tensor(Y)

print("X shape:", X.shape)
print("Y shape:", Y.shape)

X shape: torch.Size([1920536, 5])
Y shape: torch.Size([1920536])


In [37]:
import torch.nn as nn

class NextTokenPredictor(nn.Module):
    def __init__(self, vocab_size, context_size, embed_dim=64, hidden_dim=256):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.fc1 = nn.Linear(context_size * embed_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, vocab_size)
        self.activation = nn.ReLU()

    def forward(self, x):
        x = self.embed(x)
        x = x.view(x.shape[0], -1)
        x = self.activation(self.fc1(x))
        x = self.fc2(x)
        return x

In [38]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

vocab_size = len(stoi)
model = NextTokenPredictor(vocab_size, context_size).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

batch_size = 1024
epochs = 20

for epoch in range(epochs):
    total_loss = 0.0
    for i in range(0, len(X), batch_size):
        x_batch = X[i:i+batch_size].to(device)
        y_batch = Y[i:i+batch_size].to(device)

        optimizer.zero_grad()
        y_pred = model(x_batch)
        loss = loss_fn(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

torch.save(model, "cpp_next_token_model.pt")
print("✅ Model training complete and saved!")

Epoch 1/20, Loss: 8178.0161
Epoch 2/20, Loss: 6007.4413
Epoch 3/20, Loss: 5007.2210
Epoch 4/20, Loss: 4357.1789
Epoch 5/20, Loss: 3903.0950
Epoch 6/20, Loss: 3584.7054
Epoch 7/20, Loss: 3348.6311
Epoch 8/20, Loss: 3168.9155
Epoch 9/20, Loss: 3028.4081
Epoch 10/20, Loss: 2907.8445
Epoch 11/20, Loss: 2802.0616
Epoch 12/20, Loss: 2708.4168
Epoch 13/20, Loss: 2626.3248
Epoch 14/20, Loss: 2550.4460
Epoch 15/20, Loss: 2479.0044
Epoch 16/20, Loss: 2414.4028
Epoch 17/20, Loss: 2354.0456
Epoch 18/20, Loss: 2299.0495
Epoch 19/20, Loss: 2249.3460
Epoch 20/20, Loss: 2202.2419
✅ Model training complete and saved!


In [41]:
import random

def generate_next_tokens(model, seed_tokens, num_tokens=10, temperature=1.0):
    model.eval()
    generated = seed_tokens[:]
    for _ in range(num_tokens):
        context = generated[-context_size:]
        x = torch.tensor([[stoi.get(t, 0) for t in context]], device=device)
        logits = model(x) / temperature
        probs = torch.softmax(logits, dim=-1)
        next_token_id = torch.multinomial(probs[0], num_samples=1).item()
        next_token = itos[next_token_id]
        generated.append(next_token)
    return " ".join(generated)

# 🔹 Try generating
# 🔹 Ask user for input text
user_input = input("Enter starting C++ code snippet: ")

# 🔹 Tokenize the input properly (C++ syntax-aware)
user_tokens = re.findall(r'[A-Za-z_][A-Za-z0-9_]*|[{}()\[\];.,=+\-*/<>!&|^%~?:]', user_input)

# 🔹 Handle unknown tokens gracefully
user_tokens = [t for t in user_tokens if t in stoi]
if len(user_tokens) < context_size:
    print(f"⚠️ Input too short, padding with <unk> tokens")
    user_tokens = ['<unk>'] * (context_size - len(user_tokens)) + user_tokens

# 🔹 Generate new code continuation
generated_code = generate_next_tokens(model, user_tokens, num_tokens=30, temperature=1.0)

print("\nGenerated continuation:\n")
print(generated_code)


Generated continuation:

int a = ; b = a + ; } / / if future , * / , , / / with a having position as the best , void everything , json : : / ) ) ;
