In [1]:
import torch
import torch.nn as nn
import torch.functional as F
import tensorflow as tf

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from collections import Counter


import re
import os

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [3]:
import os

# Check that the repo exists in your current working directory
os.listdir("GIANT-Python-Code")[:10]

['.git',
 '.gitignore',
 'Algorithm',
 'ExperimentLogistic',
 'ExperimentQuadratic',
 'LICENSE',
 'LinuxSetup.sh',
 'README.md',
 'Resource',
 'Util']

In [7]:
import glob
import py_compile
import os

# Change this to your repository’s root folder
repo_path = "GIANT-Python-Code"

# Find all .py files inside the repo (recursively)
python_files = [f for f in glob.glob(os.path.join(repo_path, "**", "*.py"), recursive=True) if os.path.isfile(f)]

print(f"Found {len(python_files)} .py files")
print("Example files:\n", python_files[:10])

compiled_count = 0
for file in python_files:
    try:
        py_compile.compile(file, cfile=file + "c", doraise=True)
        compiled_count += 1
    except (py_compile.PyCompileError, PermissionError) as e:
        print(f"❌ Failed to compile: {file}\nError: {e}")

print(f"\nSuccessfully compiled {compiled_count}/{len(python_files)} Python files.")

Found 17 .py files
Example files:
 ['GIANT-Python-Code\\Algorithm\\ExecutorLogistic.py', 'GIANT-Python-Code\\Algorithm\\ExecutorQuadratic.py', 'GIANT-Python-Code\\Algorithm\\Solver.py', 'GIANT-Python-Code\\ExperimentLogistic\\demo.py', 'GIANT-Python-Code\\ExperimentLogistic\\experiment1.py', 'GIANT-Python-Code\\ExperimentLogistic\\experiment1ec2.py', 'GIANT-Python-Code\\ExperimentLogistic\\experiment2.py', 'GIANT-Python-Code\\ExperimentQuadratic\\demo.py', 'GIANT-Python-Code\\ExperimentQuadratic\\experiment1.py', 'GIANT-Python-Code\\ExperimentQuadratic\\experiment2.py']

Successfully compiled 17/17 Python files.


In [8]:
import os

combined_python_text = ""

for file in python_files:
    if not os.path.isfile(file):
        continue  # Skip directories just in case
    try:
        with open(file, "r", encoding="utf-8", errors="ignore") as f:
            combined_python_text += f.read() + "\n\n"
    except PermissionError:
        print(f"Skipping (no permission): {file}")
    except Exception as e:
        print(f"Skipping {file} due to error: {e}")

print(f"\n✅ Combined {len(python_files)} Python files into a single text string.")
print(f"Total combined text length: {len(combined_python_text):,} characters")


✅ Combined 17 Python files into a single text string.
Total combined text length: 55,501 characters


In [9]:
import re
import pandas as pd
from collections import Counter

# Use your existing in-memory string
python_text = combined_python_text  # already combined earlier

# Tokenize: match identifiers, keywords, numbers, and punctuation
tokens = re.findall(r'[A-Za-z_][A-Za-z0-9_]*|[{}()\[\];.,=+\-*/<>!&|^%~?:]', python_text)

# Count token frequencies
token_counts = Counter(tokens)
df_counts = pd.DataFrame(token_counts.items(), columns=["token", "frequency"])
df_counts = df_counts.sort_values(by="frequency", ascending=False)

# Show stats
print("Total tokens:", len(tokens))
print("Unique tokens:", len(df_counts))

print("\nTop 20 most frequent tokens:\n")
print(df_counts.head(20).to_string(index=False))

# Optionally save vocabulary to CSV for later reuse
df_counts.to_csv("python_vocabulary.csv", index=False)
print("Saved token frequencies to python_vocabulary.csv")


Total tokens: 13628
Unique tokens: 521

Top 20 most frequent tokens:

   token  frequency
       =       1582
       ,       1292
       .       1019
       )        970
       (        970
   numpy        275
       +        268
    self        264
       :        227
       -        202
       [        179
       ]        179
       m        158
       d        132
       *        130
     plt        125
dataname        111
       /        100
       n         96
       X         95
Saved token frequencies to python_vocabulary.csv


In [11]:
import torch

# Create mappings (string ↔ integer)
stoi = {s: i for i, s in enumerate(df_counts["token"])}
itos = {i: s for s, i in stoi.items()}

# Define context size (how many previous tokens to use for prediction)
context_size = 5  # you can adjust between 3–10

# Prepare X (context) and Y (target) lists
X, Y = [], []

for i in range(len(tokens) - context_size):
    context = tokens[i:i + context_size]
    target = tokens[i + context_size]
    X.append([stoi[t] for t in context])
    Y.append(stoi[target])

# Convert to PyTorch tensors
X = torch.tensor(X, dtype=torch.long)
Y = torch.tensor(Y, dtype=torch.long)

print(f"Created {len(X):,} training samples")
print(f"Input tensor shape: {X.shape}")
print(f"Target tensor shape: {Y.shape}")

Created 13,623 training samples
Input tensor shape: torch.Size([13623, 5])
Target tensor shape: torch.Size([13623])


In [12]:
# Move to torch tensors
X = torch.tensor(X)
Y = torch.tensor(Y)

print("X shape:", X.shape)
print("Y shape:", Y.shape)

X shape: torch.Size([13623, 5])
Y shape: torch.Size([13623])


  X = torch.tensor(X)
  Y = torch.tensor(Y)


In [13]:
import torch.nn as nn

class NextTokenPredictor(nn.Module):
    def __init__(self, vocab_size, context_size, embed_dim=64, hidden_dim=256):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.fc1 = nn.Linear(context_size * embed_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, vocab_size)
        self.activation = nn.ReLU()

    def forward(self, x):
        x = self.embed(x)
        x = x.view(x.shape[0], -1)
        x = self.activation(self.fc1(x))
        x = self.fc2(x)
        return x

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

vocab_size = len(stoi)
model = NextTokenPredictor(vocab_size, context_size).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

batch_size = 1024
epochs = 30

for epoch in range(epochs):
    total_loss = 0.0
    for i in range(0, len(X), batch_size):
        x_batch = X[i:i+batch_size].to(device)
        y_batch = Y[i:i+batch_size].to(device)

        optimizer.zero_grad()
        y_pred = model(x_batch)
        loss = loss_fn(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

torch.save(model, "python_model1.pt")
print("Model training complete and saved!")