In [20]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
from scipy.linalg import qr
import pickle
from tqdm import tqdm
import pywt
#from random import randint

# Labels grouped as "bad"; not referenced elsewhere in the visible cells.
# NOTE(review): presumably the ventricular-arrhythmia class labels of the
# contest data set - confirm against the data documentation.
BAD = ['VT', 'VFb', 'VFt'] # KIS,S
# Root of the contest demo project; the batch generators below look for the
# training/testing scripts and the `models` and `results` folders here.
project_dir = r'C:\Users\jodge\Documents\School\Summer24\tinymlcontest2022_demo_example'
# Folder with the raw training signals; each file is read with np.loadtxt.
dir_path = r'C:\Users\jodge\Documents\School\Summer24\tinyml_contest_data_training'
# Snapshot of the file names in dir_path, taken once when this cell runs.
directory_files = os.listdir(dir_path)

# Reference table kept as a bare string literal, so it is also echoed as this
# cell's output. NOTE(review): presumably the number of SVD ranks needed to
# capture each fraction of the energy - confirm where it was generated.
"""
Energy captured: ranks needed
 5%: 14  10%: 29  15%: 46  20%: 64   25%:  84
30%: 106 35%: 129 40%: 154 45%: 182  50%:  215
55%: 252 60%: 297 65%: 354 70%: 443  75%:  552
80%: 670 85%: 798 90%: 935 95%: 1084 100%: 1250 
"""

'\nEnergy captured: ranks needed\n 5%: 14  10%: 29  15%: 46  20%: 64   25%:  84\n30%: 106 35%: 129 40%: 154 45%: 182  50%:  215\n55%: 252 60%: 297 65%: 354 70%: 443  75%:  552\n80%: 670 85%: 798 90%: 935 95%: 1084 100%: 1250 \n'

In [2]:
def readObj(filename):
    """Unpickle and return the object stored in ``filename``.

    A short progress note is printed before the file is opened.

    NOTE: ``pickle.load`` can execute arbitrary code, so only point this at
    pickle files produced by this project.
    """
    print(f"reading from {filename}")
    with open(filename, 'rb') as source:
        loaded = pickle.load(source)
    return loaded
def saveObj(obj, filename):
    """Pickle ``obj`` into ``filename``, overwriting any existing file.

    A short progress note is printed before the file is opened.
    """
    print(f"saving to {filename}")
    with open(filename, 'wb') as sink:
        pickle.dump(obj, sink)

In [3]:
def setCorrectModel(path, wanted = "original"):
    """Build the batch ``ren`` command that activates the requested model file.

    The command renames the stored variant inside ``path`` to ``model_1.py``.

    :param path: directory that holds the model variants
    :param wanted: ``"original"`` (stored as ``model_1.py.og``) or
        ``"avgpool"`` (stored as ``model_1.py.avgpool``)
    :return: the ``ren ...`` command line as a string
    :raises ValueError: if ``wanted`` names an unknown variant (previously this
        returned ``None``, which only failed later while writing the .bat file)
    """
    stored_suffix = {"original": "og", "avgpool": "avgpool"}
    if wanted not in stored_suffix:
        raise ValueError(
            f"unknown model variant {wanted!r}; expected one of {sorted(stored_suffix)}"
        )
    stored_file = os.path.join(path, f"model_1.py.{stored_suffix[wanted]}")
    return f"ren {stored_file} model_1.py"
        
def resetModel(models_dir, used = "original"):
    """Build the batch ``ren`` command that stores ``model_1.py`` away again.

    ``used`` says which variant is currently active: ``"original"`` is renamed
    back to ``model_1.py.og`` and ``"avgpool"`` to ``model_1.py.avgpool``.
    Any other value prints a warning and yields ``None``.
    """
    if used not in ("original", "avgpool"):
        print("BAD!!!!!!!!!!!!!")
        return None

    stored_suffix = "og" if used == "original" else "avgpool"
    return f"ren {os.path.join(models_dir, 'model_1.py')} model_1.py.{stored_suffix}"

In [4]:
def prepareData(): #for SVD
    """Write reconstructed and truncated versions of every training signal.

    Loads the pickled problem setup ``full.pkl``, then for each file in
    ``directory_files``:

    * reconstructs the signal from its sensor measurements and saves it as
      ``R<name>`` in the ``recon_data`` folder;
    * saves the raw sensor measurements ``C @ signal`` as ``T<name>`` in the
      ``trunc_data`` folder.

    :return: ``(r, p)`` of the setup that was actually loaded, so the caller
        can compare them with the values it asked for. (The previous version
        returned the notebook globals ``r`` and ``p``, which made that
        comparison meaningless and failed on a fresh kernel.)
    """
    problemSetupsDir = r'C:\Users\jodge\Documents\School\Summer24\playground\data'
    problemSetup = readObj(os.path.join(problemSetupsDir, 'full.pkl'))
    print(problemSetup.r, problemSetup.p) # make sure it's working

    recon_dir = r'C:\Users\jodge\Documents\School\Summer24\recon_data'
    trunc_dir = r'C:\Users\jodge\Documents\School\Summer24\trunc_data'

    for fi in tqdm(directory_files): #special iterator that makes a progress bar
        signal = np.loadtxt(os.path.join(dir_path, fi))
        # The reconstruction error is not needed here, only the signal itself.
        _, recon = measureAndReconstruct(problemSetup, signal)
        trunc = problemSetup.C @ signal

        np.savetxt(os.path.join(recon_dir, 'R' + fi), recon, fmt='%.7f')
        np.savetxt(os.path.join(trunc_dir, 'T' + fi), trunc, fmt='%.7f')

    print("DONE")
    # p (normally r + 1) is the value used in the generated file names.
    return problemSetup.r, problemSetup.p

In [5]:
class SVD:
    """Plain container for the three factors of a singular value decomposition."""

    def __init__(self, U, S, VT):
        # Stored exactly as given; no validation or copying is done.
        self.U, self.S, self.VT = U, S, VT

class ProblemSetup:
    """Bundle of everything produced while setting up a sensing problem."""

    def __init__(self, r, p, fullSVD, truncSVD, Q, R, pivots, C):
        # r: ranks to use, p: number of sensors
        self.r, self.p = r, p
        # Decompositions before and after truncation.
        self.fullSVD, self.truncSVD = fullSVD, truncSVD
        # Pivoted-QR outputs and the measurement matrix built from the pivots.
        self.Q, self.R, self.pivots = Q, R, pivots
        self.C = C
      
def getFullTrainSignalMatrix(): #all files with each as a column
    """Return the cached training matrix from ``data/fullTrainMat.pkl``.

    When the pickle is missing, a hint is printed and ``None`` is returned;
    the matrix itself is generated by other notebooks.
    """
    pkl_loc = os.path.join("data", r"fullTrainMat.pkl")

    if not os.path.exists(pkl_loc):
        print(f"{pkl_loc} not found - see other notebooks for generation")
        return None

    return readObj(pkl_loc)
        
def processData(trainingData, label, r = 400, p = 500): # r = ranks desired, p = num sensors.
    """Compute the SVD basis and QR-pivot sensor placement for ``trainingData``.

    :param trainingData: 2-D array; its row count sets the width of ``C``
    :param label: base name of the pickle written to ``data/<label>.pkl``
    :param r: number of leading singular vectors to keep
    :param p: number of sensors (must be >= r)
    :return: the ``ProblemSetup`` that was also saved to disk
    :raises ValueError: if ``p < r`` (previously this printed a message 100
        times and then crashed with a TypeError on ``None[:p]``)
    """
    # Fail before the expensive SVD rather than after it.
    if p < r:
        raise ValueError(f"p ({p}) must be >= r ({r})")

    U, S, VT = svd(trainingData)
    full_SVD = SVD(U, S, VT)

    #reshape SVD according to r
    U_hat, S_hat, VT_hat = U[:,:r], S[:r,:r], VT[:r,:]
    trunc_SVD = SVD(U_hat, S_hat, VT_hat)

    if p == r:
        # Pivot over the rows of U_hat (the candidate sensor locations), so
        # the QR has to be taken on the transpose. Factoring U_hat itself
        # only permutes its r columns and never selects a signal sample.
        Q, R, pivots = qr(U_hat.T, pivoting = True)
    else: # p > r, oversampled
        Q, R, pivots = qr(U_hat @ U_hat.T, pivoting = True)

    pivots = pivots[:p]

    # Create C matrix: one row per sensor with a single 1 at the sensor's
    # sample index. The width comes from the data that was passed in, not
    # from re-reading the cached full matrix from disk.
    C = np.zeros((p, trainingData.shape[0]))
    C[np.arange(p), pivots] = 1

    problemSetup = ProblemSetup(r, p, full_SVD, trunc_SVD, Q, R, pivots, C)
    filename = os.path.join("data", label + ".pkl")
    saveObj(problemSetup, filename)

    return problemSetup
    
def svd(x = None):
    """Economy-size SVD with the singular values returned as a diagonal matrix.

    :param x: 2-D array to decompose. When omitted, the cached full training
        matrix is loaded at call time. (The old default was evaluated once,
        when the function was defined, which read the pickle as a side effect
        of running the cell and kept that matrix alive for the whole session.)
    :return: ``(U, S, VT)`` with ``S`` a square diagonal matrix such that
        ``U @ S @ VT`` reproduces ``x``
    """
    if x is None:
        x = getFullTrainSignalMatrix()
    U, singular_values, VT = np.linalg.svd(x, full_matrices=False)
    return (U, np.diag(singular_values), VT)

def measureAndReconstruct(problemSetup, signal):
    """Sample ``signal`` at the sensors and rebuild it from the truncated basis.

    :param problemSetup: object exposing ``C``, ``truncSVD.U``, ``p``, ``r``
        and ``pivots``
    :param signal: the full signal to measure
    :return: ``(mse, reconstruction)`` where ``mse`` is the mean squared error
        between ``signal`` and the reconstruction
    """
    sensing = problemSetup.C
    basis = problemSetup.truncSVD.U
    n_sensors = problemSetup.p
    n_modes = problemSetup.r
    sensor_rows = problemSetup.pivots

    # Measure a signal
    measurements = sensing @ signal

    # Basis restricted to the sensor rows; only the p == r branch uses it.
    sampled_basis = basis[:, :n_sensors][sensor_rows, :]

    # Solve for coefficients
    if n_sensors != n_modes:
        coeffs = np.linalg.pinv(sensing @ basis) @ measurements
    else:
        coeffs = np.linalg.lstsq(sampled_basis, measurements, rcond=None)[0]

    estimate = basis @ coeffs
    error = np.mean((signal - estimate) ** 2)
    return error, estimate

reading from data\fullTrainMat.pkl


In [6]:
def batGen(rp, numRuns=5, mode = "normal", sampleRate = 1.0, length = 1250, model = "original", kept = 100):
    """Write a Windows batch file that trains and tests the model ``numRuns`` times.

    The batch file activates the requested model variant, loops over the
    training and testing scripts (appending their output to files in the
    project's ``results`` folder), restores the model file and pauses.

    :param rp: rank / sensor count, used only in the result file names
    :param numRuns: number of train+test iterations in the batch loop
    :param mode: ``"recon"``, ``"trunc"``, ``"drop"`` or ``"fft"`` select the
        matching ``<mode>_data`` and ``<mode>_indices`` folders; any other
        value uses the scripts' default paths
    :param sampleRate: forwarded as ``--sample_rate`` when it is not 1.0
    :param length: signal length; raised to 98 when smaller, and forwarded as
        ``--size`` when it is not 1250
    :param model: ``"original"`` or ``"avgpool"``
    :param kept: keep-percentage tag added to the file names when not 100
    :return: ``(lines, bat_name)`` - the batch file lines and its file name
    :raises ValueError: if ``model`` is not a known variant
    """
    training_script = os.path.join(project_dir, 'training_save_deep_models.py')
    testing_script = os.path.join(project_dir, 'testing_performances.py')
    models_dir = os.path.join(project_dir, 'models')
    results_dir = os.path.join(project_dir, 'results')

    # Resolve the rename commands first so an unknown model fails before any
    # file is created (a None command used to crash halfway through writing).
    activate_cmd = setCorrectModel(models_dir, model)
    restore_cmd = resetModel(models_dir, model)
    if activate_cmd is None or restore_cmd is None:
        raise ValueError(f"unknown model {model!r}; expected 'original' or 'avgpool'")

    data_dir = r'C:\Users\jodge\Documents\School\Summer24'
    path_flags = ''
    if mode in ('recon', 'trunc', 'drop', 'fft'):
        mode_data_dir = os.path.join(data_dir, f'{mode}_data\\')
        mode_indices_dir = os.path.join(project_dir, f'{mode}_indices\\')
        path_flags = f'--path_data {mode_data_dir} --path_indices {mode_indices_dir}'

    size_flags = ''
    # NOTE(review): 98 is presumably the smallest input the model accepts - confirm.
    if length < 98:
        length = 98
    if length != 1250:
        size_flags = f'--size {length}'

    sample_flags = f'--sample_rate {sampleRate}' if sampleRate != 1.0 else ''

    # Optional name tags. The sample-rate tag used to be built with
    # '_' + int(sampleRate), which raised TypeError and would have dropped
    # fractional rates anyway; ':g' keeps the value and trims trailing zeros.
    sample_tag = f'_{sampleRate:g}' if sample_flags else ''
    kept_tag = '_k' + str(kept) if kept != 100 else ''
    name_stem = f'{mode}_r{rp}_{length}{sample_tag}{kept_tag}_{model}'
    out_train_name = f'train_{name_stem}.txt'
    out_test_name = f'TEST_{name_stem}.csv'
    output_train_file = os.path.join(results_dir, out_train_name)
    output_test_file = os.path.join(results_dir, out_test_name)

    train_command = f'\tpython {training_script} {path_flags} {sample_flags} {size_flags} >> {output_train_file}'
    test_command = f'\tpython {testing_script} {path_flags} {sample_flags} {size_flags} >> {output_test_file}'

    ans = [
        r'@echo off',
        activate_cmd,
        r'for /l %%i in (1,1,' + str(numRuns) + r') do (',
        '\techo iteration %%i',
        train_command,
        test_command,
        ')',
        restore_cmd,
        'pause',
    ]

    # Strip only the real extension: tags such as "_k2.4" contain dots, and
    # split(".")[0] used to cut the name off at the first one.
    test_stem = os.path.splitext(out_test_name)[0]
    bat_name = "_".join(test_stem.split("_")[1:]) + ".bat"

    os.makedirs("bats", exist_ok=True)
    bat = os.path.join("bats", bat_name)
    with open(bat, "w") as output:
        for line in ans:
            output.write(line + '\n')

    return ans, bat_name

In [7]:
def batGenNew(out_train_name, out_test_name, numRuns = 5, indices_folder = None, data_folder=None, length=1250, model = 'original', additional_flags = ''):
    """Write a Windows batch file that trains and tests the model ``numRuns`` times.

    Unlike ``batGen`` the caller supplies the result file names and,
    optionally, the data / indices folders and extra command-line flags.

    :param out_train_name: file name (inside the project's ``results`` folder)
        that training output is appended to
    :param out_test_name: same for testing output; the batch file name is this
        name without its first ``_``-separated token and without extension
    :param numRuns: number of train+test iterations in the batch loop
    :param indices_folder: folder under ``project_dir`` passed as ``--path_indices``
    :param data_folder: folder under the Summer24 directory passed as ``--path_data``
    :param length: signal length; raised to 98 when smaller, and forwarded as
        ``--size`` when it is not 1250
    :param model: ``"original"`` or ``"avgpool"``
    :param additional_flags: text appended verbatim to both commands
    :return: the name of the batch file written into ``bats``
    :raises ValueError: if ``model`` is not a known variant
    """
    training_script = os.path.join(project_dir, 'training_save_deep_models.py')
    testing_script = os.path.join(project_dir, 'testing_performances.py')
    models_dir = os.path.join(project_dir, 'models')
    results_dir = os.path.join(project_dir, 'results')

    # Resolve the rename commands first so an unknown model fails before any
    # file is created (a None command used to crash halfway through writing).
    activate_cmd = setCorrectModel(models_dir, model)
    restore_cmd = resetModel(models_dir, model)
    if activate_cmd is None or restore_cmd is None:
        raise ValueError(f"unknown model {model!r}; expected 'original' or 'avgpool'")

    output_train_file = os.path.join(results_dir, out_train_name)
    output_test_file = os.path.join(results_dir, out_test_name)

    path_flags = ''
    if data_folder:
        data_path = os.path.join(r'C:\Users\jodge\Documents\School\Summer24', data_folder)
        path_flags += f' --path_data {data_path}'
    if indices_folder:
        indices_path = os.path.join(project_dir, indices_folder)
        path_flags += f' --path_indices {indices_path}'

    size_flags = ''
    # NOTE(review): 98 is presumably the smallest input the model accepts - confirm.
    if length < 98:
        length = 98
    if length != 1250:
        size_flags = f'--size {length}'

    train_command = f'\tpython {training_script} {path_flags} {size_flags} {additional_flags} >> {output_train_file}'
    test_command = f'\tpython {testing_script} {path_flags} {size_flags} {additional_flags} >> {output_test_file}'

    ans = [
        r'@echo off',
        activate_cmd,
        r'for /l %%i in (1,1,' + str(numRuns) + r') do (',
        '\techo iteration %%i',
        train_command,
        test_command,
        ')',
        restore_cmd,
        'pause',
    ]

    # Strip only the real extension so names containing dots (for example a
    # wavelet such as "bior3.3") are not cut off at the first dot.
    test_stem = os.path.splitext(out_test_name)[0]
    bat_name = "_".join(test_stem.split("_")[1:]) + ".bat"

    os.makedirs("bats", exist_ok=True)
    bat = os.path.join("bats", bat_name)
    with open(bat, "w") as output:
        for line in ans:
            output.write(line + '\n')

    return bat_name

# SVD

In [None]:
# Build the sensing setup for the chosen rank, then regenerate the
# reconstructed / truncated data sets from it.
r = 552
p = r+1 # one more sensor than ranks (the oversampled p > r case)
fullTrainSignalMatrix = getFullTrainSignalMatrix()
# Saved as data/full.pkl because the label is "full".
problemSetup = processData(fullTrainSignalMatrix, "full", r, p)
r0,p0 = prepareData()
print(r,r0, p,p0) # sanity check

In [None]:
# Batch file for training on the truncated (sensor-only) signals, which have
# p samples each.
ans, bat_name = batGen(p, mode = "trunc", sampleRate = 1.0, length = p, model="avgpool")
print(bat_name)

# Random Dropout (Static) Mask

In [None]:
def random_drop_mask(keep_percentage, length=1250, rng=None):
    """Build a random boolean keep-mask.

    Each position is kept independently with probability
    ``keep_percentage / 100``, so the number of ``True`` entries is only
    approximately ``length * keep_percentage / 100``.

    :param keep_percentage: percentage of positions to keep, in (0, 100]
    :param length: number of entries in the mask
    :param rng: optional seed or ``numpy.random.Generator`` for a reproducible
        mask; when omitted the global NumPy random state is used, as before
    :return: boolean NumPy array of shape ``(length,)``; ``True`` means keep
    :raises ValueError: if ``keep_percentage`` is outside (0, 100]
    """
    if not 0 < keep_percentage <= 100:
        raise ValueError("Keep percentage must be between 0 and 100")

    # Legacy global state by default so existing np.random.seed() calls still
    # control the result; default_rng accepts both seeds and Generators.
    source = np.random if rng is None else np.random.default_rng(rng)

    # One Bernoulli draw per position.
    keep_mask = source.binomial(1, keep_percentage / 100, size=length).astype(bool)

    return keep_mask
    
def randomDropout(keep_percent = 50):
    """Apply one shared random keep-mask to every training file.

    A single mask is drawn up front, so every file keeps the same sample
    positions. Each masked signal is written as ``D<name>`` into the
    ``drop_data`` folder.

    :return: how many samples the mask keeps per file
    """
    out_dir = r'C:\Users\jodge\Documents\School\Summer24\drop_data'
    mask = random_drop_mask(keep_percent)
    print(f"keep: {keep_percent}%. mask: {np.sum(mask)}") 

    for fi in tqdm(directory_files, desc="Dropping out files"):
        signal = np.loadtxt(os.path.join(dir_path, fi))
        np.savetxt(os.path.join(out_dir, 'D' + fi), signal[mask], fmt='%.7f')

    print("DONE")
    # Should be close to 1250 * keep percent.
    kept_count = np.sum(mask)
    return kept_count


In [None]:
# Percentage of samples to keep, in (0, 100]; 2.4 means 2.4 percent, not 50.
keeping = 2.4
# Number of samples each dropped-out file ends up with.
masklen = randomDropout(keeping)

In [None]:
# Batch file for training on the dropped-out signals; `masklen` and `keeping`
# come from the previous cell.
ans, bat_name = batGen(1250, mode = "drop", length = masklen, model = "avgpool", kept = keeping)
print(bat_name)

# FFT

In [None]:
def fftMask(ranks = 250):
    """Return a boolean mask over rFFT bins marking the ``ranks`` most common ones.

    For every training file the ``ranks`` largest-magnitude rFFT bins are
    found; the mask selects the ``ranks`` bins that appear in those per-file
    top lists most often. The result is cached in
    ``data/fftMask<ranks>.pkl`` and reused when that file exists.

    :param ranks: number of frequency bins to select (must be positive)
    :return: boolean array with one entry per rFFT bin
    :raises ValueError: if ``ranks`` is not positive, or there are no training
        files to build the mask from
    """
    if ranks <= 0:
        # counts[-0:] would select every bin instead of none.
        raise ValueError("ranks must be a positive integer")

    pkl_name = "fftMask" + str(ranks) + ".pkl"
    pkl_loc = os.path.join("data", pkl_name)

    if os.path.exists(pkl_loc):
        mask = readObj(pkl_loc)
    else:
        # Integer tally sized from the first spectrum. It used to be created
        # with zeros_like() on the complex rFFT output, which made the counter
        # itself complex and required loading the first file twice.
        counts = None
        print()
        for fi in tqdm(directory_files, desc = f"Generating FFT mask of {ranks} ranks."):
            spectrum = np.fft.rfft(np.loadtxt(os.path.join(dir_path, fi)))
            if counts is None:
                counts = np.zeros(spectrum.shape, dtype=int)

            # Bin indices sorted by descending magnitude.
            ranked = np.argsort(np.abs(spectrum))[::-1]
            counts[ranked[:ranks]] += 1

        if counts is None:
            raise ValueError("no training files found to build the FFT mask from")

        plt.plot(counts)
        plt.title("counts")
        plt.show()

        final_indices = np.argsort(counts)[-ranks:]
        mask = np.zeros_like(counts, dtype=bool)
        mask[final_indices] = True
        saveObj(mask, pkl_loc)

    print(f"mask of len {len(mask)} ready")
    return mask

def fftSparseGen(mask, ranks, trunc = False):
    """Write a masked rFFT of every training file into the ``fft_data`` folder.

    :param mask: boolean array over rFFT bins
    :param ranks: accepted but not used by this function
    :param trunc: when true only the masked bins are kept (shorter output);
        otherwise unmasked bins are replaced by zero (same length)

    Only the real part of the spectrum is saved. Files are named ``F<name>``.
    """
    out_dir = r'C:\Users\jodge\Documents\School\Summer24\fft_data'

    for fi in tqdm(directory_files, desc="Files Processing"):
        spectrum = np.fft.rfft(np.loadtxt(os.path.join(dir_path, fi)))

        # Either drop the unmasked bins or zero them out.
        selected = spectrum[mask] if trunc else np.where(mask, spectrum, 0)

        np.savetxt(os.path.join(out_dir, 'F' + fi), selected.real, fmt='%.7f')

    print("DONE")

In [None]:
# Alternative kept for reference: a hand-made pass-band mask instead of the
# data-driven one from fftMask().
#static_mask = np.zeros(626, dtype=bool)
#pass_band_filter = list(range(15,55))
#static_mask[pass_band_filter] = True

ranks = 50 #len(pass_band_filter)
# True keeps only the masked bins (output length == ranks); False zeroes the
# other bins and keeps the full spectrum length.
truncate = True
fftSparseGen(fftMask(ranks), ranks, trunc = truncate)

In [None]:
# Uses `ranks` and `truncate` from the previous cell. batGen raises any
# length below 98 to 98 before building the --size flag.
len_temp = ranks if truncate else 1250
ans, bat_name = batGen(ranks, mode = "fft", length = len_temp, model = "avgpool")
print(bat_name)

# Wavelet

In [8]:
def basicDWTProcess(w='db4', m='zero'):
    """Replace every training file by its single-level DWT approximation.

    :param w: wavelet name passed to ``pywt.dwt``
    :param m: signal extension mode passed to ``pywt.dwt``
    :return: number of approximation coefficients, measured on the second
        file of ``directory_files``

    Outputs keep their original file names and go to the ``temp_data`` folder.
    """
    out_dir = r'C:\Users\jodge\Documents\School\Summer24\temp_data'

    # Probe one file up front to learn the output length.
    probe = np.loadtxt(os.path.join(dir_path, directory_files[1]))
    probe_approx, _ = pywt.dwt(probe, wavelet=w, mode=m)
    size = len(probe_approx)

    for fi in tqdm(directory_files, desc="Files Processing"):
        signal = np.loadtxt(os.path.join(dir_path, fi))
        approx, _ = pywt.dwt(signal, wavelet=w, mode=m)
        np.savetxt(os.path.join(out_dir, fi), approx, fmt='%.7f')

    return size #return length after processing
    
def waveletDecomp(w='db4', m='zero', l=5, r=3):
    """Replace every training file by one coefficient band of a multilevel DWT.

    :param w: wavelet name passed to ``pywt.wavedec``
    :param m: signal extension mode passed to ``pywt.wavedec``
    :param l: decomposition level
    :param r: index into the list returned by ``pywt.wavedec`` of the band to keep
    :return: length of the kept band, measured on the second file of
        ``directory_files``

    Outputs keep their original file names and go to the ``temp_data`` folder.
    """
    out_dir = r'C:\Users\jodge\Documents\School\Summer24\temp_data'

    # Probe one file up front to learn the output length.
    probe = np.loadtxt(os.path.join(dir_path, directory_files[1]))
    size = len(pywt.wavedec(probe, w, mode=m, level=l)[r])

    for fi in tqdm(directory_files, desc=f"{w} {m} {l}"):
        signal = np.loadtxt(os.path.join(dir_path, fi))
        bands = pywt.wavedec(signal, w, mode=m, level=l)
        np.savetxt(os.path.join(out_dir, fi), bands[r], fmt='%.7f')

    return size #return length after processing  

### simple DWT

In [None]:
# Single-level DWT of every training file; `length` is the number of
# approximation coefficients written per file.
w = 'db4'
m = 'zero'
length = basicDWTProcess(w,m)

mod = 'avgpool'
# The names must use `mod`: this cell never defines `model`, so the previous
# f-strings raised NameError on a fresh kernel (or silently picked up a stale
# value left over from an earlier session).
out_train_name = f'train_dwt_{w}_{m}_r{length}_l{length}_{mod}.txt'
out_test_name = f'TEST_dwt_{w}_{m}_r{length}_l{length}_{mod}.csv'

data_folder = "temp_data\\"

bat_name = batGenNew(out_train_name, out_test_name, numRuns = 5, data_folder=data_folder, model = mod, length=length)
bat_name

### configurable wavelet decomp

In [19]:
# Multilevel wavelet decomposition: keep one coefficient band per file, then
# generate the matching batch file.
w = 'db4' # wavelet_list = ['db4', 'db6', 'db8', 'sym4', 'sym6', 'sym8', 'coif3', 'coif4', 'bior3.3', 'bior3.5']

m = 'periodic' # zero symmetric periodic
levels = 5
wanted = 3 #informed by waveletMaskCounter, seems to always be 3 at level 5 decomp
length = waveletDecomp(w,m,l=levels,r=wanted)

# Mirrors the clamp inside batGenNew (lengths below 98 become 98); used only
# for the file names below.
size = 98 if length < 98 else length

mod = 'avgpool'
# Wavelet names such as 'bior3.3' contain a dot; swap it for '-' in file names.
w = w.replace('.','-')
file_setup = f'{w}_{m}_{levels}-{wanted}_r{length}-{size}_{mod}'
out_train_name = f'train_{file_setup}.txt'
out_test_name = f'TEST_{file_setup}.csv'

data_folder = "temp_data\\"

# The unclamped `length` is passed on; batGenNew applies the clamp itself.
bat_name = batGenNew(out_train_name, out_test_name, numRuns = 5, data_folder=data_folder, model = mod, length=length)
bat_name

db4 periodic 5: 100%|███████████████████████████████████████████████████████████| 30213/30213 [00:46<00:00, 648.06it/s]


'db4_periodic_5-3_r162-162_avgpool.bat'