In [1]:
import pandas as pd
import glob
import os
import numpy as np

# Path to search for CSV files
base_path = 'Cpp_experiment'

# Find all particle_times_output.csv files in subfolders
csv_files = glob.glob(os.path.join(base_path, '**', 'particle_times_output.csv'), recursive=True)

# Create an empty list to store dataframes
dfs = []

# Read each CSV file and add it to the list
for file_path in csv_files:
    try:
        # Read the CSV file
        df = pd.read_csv(file_path)
        
        # Add a column to identify which file this data came from
        folder_name = os.path.dirname(file_path).split('/')[-1]  # Get the immediate parent folder name
        
        dfs.append(df)
        print(f"Successfully read: {file_path}")
    except Exception as e:
        print(f"Error reading {file_path}: {str(e)}")

if dfs:
    # Combine all dataframes
    combined_df = pd.concat(dfs, ignore_index=True)
    
    # Save the combined dataset
    output_path = 'combined_particle_times.csv'
    combined_df.to_csv(output_path, index=False)
    print(f"\nCombined dataset saved to: {output_path}")
    print(f"Total number of files processed: {len(dfs)}")
    print(f"Total number of rows in combined dataset: {len(combined_df)}")
else:
    print("No CSV files were found or successfully processed.")

Successfully read: Cpp_experiment/U238_source/particle_times_output.csv
Successfully read: Cpp_experiment/Pu238_source/particle_times_output.csv
Successfully read: Cpp_experiment/CF_252_source/particle_times_output.csv
Successfully read: Cpp_experiment/Pu240_source/particle_times_output.csv
Successfully read: Cpp_experiment/CF_250_source/particle_times_output.csv
Successfully read: Cpp_experiment/Pu242_source/particle_times_output.csv
Successfully read: Cpp_experiment/U235_source/particle_times_output.csv
Successfully read: Cpp_experiment/Pu239_source/particle_times_output.csv

Combined dataset saved to: combined_particle_times.csv
Total number of files processed: 8
Total number of rows in combined dataset: 4170


In [2]:
combined_df.head()

Unnamed: 0,source_file,fuel_density,coolant_density,radius,particle_times
0,u238sourceair.py,18.6,0.001,0.1,[2.37808510e-08 3.18942892e-06 4.47266153e-08 ...
1,u238sourceair.py,18.6,0.001,0.2,[4.75617020e-08 3.18942892e-06 8.94532306e-08 ...
2,u238sourceair.py,18.6,0.001,0.3,[5.30506830e-08 7.66694967e-08 2.46003902e-06 ...
3,u238sourceair.py,18.6,0.001,0.4,[5.30506830e-08 1.03605771e-07 2.46003902e-06 ...
4,u238sourceair.py,18.6,0.001,0.5,[5.30506830e-08 1.29119529e-07 2.46003902e-06 ...


In [6]:
#print columns type
print(combined_df.dtypes)

source_file         object
fuel_density       float64
coolant_density    float64
radius             float64
particle_times      object
dtype: object


In [7]:
# First convert source_file column to string type
combined_df['source_file'] = combined_df['source_file'].astype(str)

# Extract the isotope names using str accessor on the Series
# Use str.extract() to get the pattern, then str.upper() to convert to uppercase
combined_df['isotopes'] = combined_df['source_file'].str.extract(r'([a-zA-Z]+\d+)', expand=False).str.upper()

# Keep only the columns we want and reorder them
columns_to_keep = ['isotopes', 'particle_times']
combined_df = combined_df[columns_to_keep]

# Display the first few rows to verify the changes
combined_df.head()

Unnamed: 0,isotopes,particle_times
0,U238,[2.37808510e-08 3.18942892e-06 4.47266153e-08 ...
1,U238,[4.75617020e-08 3.18942892e-06 8.94532306e-08 ...
2,U238,[5.30506830e-08 7.66694967e-08 2.46003902e-06 ...
3,U238,[5.30506830e-08 1.03605771e-07 2.46003902e-06 ...
4,U238,[5.30506830e-08 1.29119529e-07 2.46003902e-06 ...


In [15]:
import ast
import numpy as np

# Function to safely convert string representation of array to list of floats
def convert_to_float_list(x):
    try:
        # Remove the newline characters and extra spaces
        x = x.replace('\n', ' ').strip()
        # Convert string to numpy array then to list
        # Handle the case where there are ellipsis (...) in the string
        if '...' in x:
            # Split by spaces and filter out the ellipsis
            numbers = [num for num in x.strip('[]').split() if num != '...']
            return [float(num) for num in numbers]
        else:
            # If no ellipsis, convert directly
            return [float(num) for num in x.strip('[]').split()]
    except:
        return []

# Apply the conversion
combined_df['particle_times'] = combined_df['particle_times'].apply(convert_to_float_list)

# Verify the result
print("First row particle times:")
print(combined_df['particle_times'].iloc[0][:5])  # Print first 5 elements to verify
print("\nType of the first element:", type(combined_df['particle_times'].iloc[0]))

First row particle times:
[2.3780851e-08, 3.18942892e-06, 4.47266153e-08, 1.30424059e-05, 3.4852825e-08]

Type of the first element: <class 'list'>


In [18]:
combined_df['particle_times'][0]

[2.3780851e-08,
 3.18942892e-06,
 4.47266153e-08,
 1.30424059e-05,
 3.4852825e-08,
 4.34508841e-06]

In [19]:
combined_df.head()

Unnamed: 0,isotopes,particle_times
0,U238,"[2.3780851e-08, 3.18942892e-06, 4.47266153e-08..."
1,U238,"[4.7561702e-08, 3.18942892e-06, 8.94532306e-08..."
2,U238,"[5.3050683e-08, 7.66694967e-08, 2.46003902e-06..."
3,U238,"[5.3050683e-08, 1.03605771e-07, 2.46003902e-06..."
4,U238,"[5.3050683e-08, 1.29119529e-07, 2.46003902e-06..."


In [21]:
# Create a new column with the length of each particle_times array
combined_df['particle_times_length'] = combined_df['particle_times'].apply(len)

# Get value counts of the lengths
length_counts = combined_df['particle_times_length'].value_counts().sort_index()

print("Distribution of particle times array lengths:")
print(length_counts)

# Optional: Add some basic statistics
print("\nSummary statistics of array lengths:")
print(f"Mean length: {combined_df['particle_times_length'].mean():.2f}")
print(f"Min length: {combined_df['particle_times_length'].min()}")
print(f"Max length: {combined_df['particle_times_length'].max()}")

Distribution of particle times array lengths:
particle_times_length
6    4170
Name: count, dtype: int64

Summary statistics of array lengths:
Mean length: 6.00
Min length: 6
Max length: 6


In [20]:
import numpy as np

# Function to add jitter to a list of times
def add_jitter(times, relative_noise_level=0.05):
    """
    Add Gaussian noise to each time measurement
    
    Parameters:
    times: list of time measurements
    relative_noise_level: standard deviation of the noise as a fraction of the value (default 5%)
    
    Returns:
    List of jittered time measurements
    """
    times = np.array(times)
    # Generate noise with standard deviation proportional to each value
    noise = np.random.normal(0, times * relative_noise_level, size=times.shape)
    # Add noise while ensuring times remain positive
    jittered_times = np.maximum(times + noise, 0)
    return jittered_times.tolist()

# Add a new column with jittered times
combined_df['jittered_particle_times'] = combined_df['particle_times'].apply(add_jitter)

# To verify the jittering, let's print the first row's original and jittered times
print("Original times:")
print(combined_df['particle_times'].iloc[0])
print("\nJittered times:")
print(combined_df['jittered_particle_times'].iloc[0])

Original times:
[2.3780851e-08, 3.18942892e-06, 4.47266153e-08, 1.30424059e-05, 3.4852825e-08, 4.34508841e-06]

Jittered times:
[2.5544028743659276e-08, 3.294172446090311e-06, 4.5067501193098855e-08, 1.2409648267476903e-05, 3.3744576097502e-08, 4.528605715598786e-06]
