# Clustering

In [1]:
from forge.core.database import DatabaseManager
from forge.analysis.composition import CompositionAnalyzer
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import yaml

# 1. Connect to AWS database
db_config = {
    'database': {
        'dbname': 'test_database',
        'user': 'myless',
        'password': 'vcrtiwzr',
        'host': 'database-vcrtiwzr.cfg4i4qmuc4m.us-east-1.rds.amazonaws.com',
        'port': 5432
    }
}

db_manager = DatabaseManager(config_dict=db_config)



In [2]:
# 2. Get unique compositions directly from database
print("Fetching unique compositions from database...")
with db_manager.conn.cursor() as cur:
    cur.execute("""
        SELECT DISTINCT composition 
        FROM structures
        WHERE composition IS NOT NULL
    """)
    rows = cur.fetchall()

# Convert JSON compositions to dictionaries of atomic fractions
compositions = []
for row in rows:
    comp_json = row[0]  # Get the composition JSON
    # Convert counts to fractions
    total_atoms = sum(data['num_atoms'] for data in comp_json.values())
    comp_dict = {
        element: data['num_atoms'] / total_atoms 
        for element, data in comp_json.items()
    }
    compositions.append(comp_dict)

print(f"Found {len(compositions)} unique compositions")

Fetching unique compositions from database...
Found 160 unique compositions


In [25]:
compositions[3]

{'V': 0.7983870967741935,
 'W': 0.08870967741935484,
 'Cr': 0.08064516129032258,
 'Ti': 0.016129032258064516,
 'Zr': 0.016129032258064516}

In [3]:
# 3. Initialize and run composition analysis
analyzer = CompositionAnalyzer(n_components=3, random_state=42)

print("Running t-SNE and clustering analysis...")
embeddings, clusters = analyzer.analyze_compositions(compositions, n_clusters=5)



Running t-SNE and clustering analysis...


In [4]:
# 4. Generate new compositions within constraints
print("Generating new composition suggestions...")
new_compositions = analyzer.suggest_new_compositions(
    compositions,
    n_suggestions=100,
    constraints={
        'V': (0.8, 0.95),      # V must be between 70-100%
        'Cr': (0.0, 0.1),     # Cr, Ti, W, Zr each must be between 0-30%
        'Ti': (0.0, 0.1),
        'W': (0.0, 0.1),
        'Zr': (0.0, 0.02)
    }
)


Generating new composition suggestions...


In [5]:
# 5. Visualize results
#fig = plt.figure(figsize=(12, 8))
#ax = fig.add_subplot(111, projection='3d')

# Combine original and new compositions for t-SNE
all_compositions = compositions + new_compositions
print(f"Total compositions for t-SNE: {len(all_compositions)}")

# Convert all compositions to array format
all_comp_array = analyzer._compositions_to_array(all_compositions)
print(f"Combined composition array shape: {all_comp_array.shape}")

# Run t-SNE on all compositions together
all_embeddings = analyzer.tsne.fit_transform(all_comp_array)
print(f"Combined embeddings shape: {all_embeddings.shape}")

# Split embeddings back into original and new
original_embeddings = all_embeddings[:len(compositions)]
new_embeddings = all_embeddings[len(compositions):]

import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Calculate statistics for each dimension
mean = np.mean(all_embeddings, axis=0)
std = np.std(all_embeddings, axis=0)
n_std = 2  # Number of standard deviations to consider as threshold

# Create masks for outliers
mask_original = np.all(np.abs(original_embeddings - mean) < n_std * std, axis=1)
mask_new = np.all(np.abs(new_embeddings - mean) < n_std * std, axis=1)

# Create the 3D scatter plot
fig = go.Figure()

# Plot original compositions with their clusters (filtered)
fig.add_trace(go.Scatter3d(
    x=original_embeddings[mask_original, 0],
    y=original_embeddings[mask_original, 1],
    z=original_embeddings[mask_original, 2],
    mode='markers',
    marker=dict(
        size=6,
        color=np.array(clusters)[mask_original],
        colorscale='Viridis',
        showscale=True,
        colorbar=dict(title='Cluster')
    ),
    text=[f"Cluster {c}<br>" + "<br>".join([f"{k}: {v:.3f}" for k, v in comp.items()]) 
          for c, comp in zip(np.array(clusters)[mask_original], np.array(compositions)[mask_original])],
    hoverinfo='text',
    name='Existing'
))

# Plot new compositions in red (filtered)
fig.add_trace(go.Scatter3d(
    x=new_embeddings[mask_new, 0],
    y=new_embeddings[mask_new, 1],
    z=new_embeddings[mask_new, 2],
    mode='markers',
    marker=dict(
        size=10,
        color='red',
        symbol='diamond'
    ),
    text=["Suggested<br>" + "<br>".join([f"{k}: {v:.3f}" for k, v in comp.items()]) 
          for comp in np.array(new_compositions)[mask_new]],
    hoverinfo='text',
    name='Suggested'
))

# Update the layout
fig.update_layout(
    title='Interactive Composition Space Analysis',
    scene=dict(
        xaxis_title='t-SNE 1',
        yaxis_title='t-SNE 2',
        zaxis_title='t-SNE 3'
    ),
    width=1000,
    height=800,
    showlegend=True
)

# Show the plot
fig.show()

Total compositions for t-SNE: 213
Combined composition array shape: (213, 5)
Combined embeddings shape: (213, 3)


In [6]:
# 6. Print suggested compositions
print("\nSuggested new compositions:")
for i, comp in enumerate(new_compositions, 1):
    print(f"\nComposition {i}:")
    for element, fraction in comp.items():
        print(f"  {element}: {fraction:.3f}")


Suggested new compositions:

Composition 1:
  Cr: 0.041
  Ti: 0.077
  V: 0.825
  W: 0.037
  Zr: 0.019

Composition 2:
  Cr: 0.020
  Ti: 0.019
  V: 0.927
  W: 0.015
  Zr: 0.019

Composition 3:
  Cr: 0.041
  Ti: 0.076
  V: 0.828
  W: 0.037
  Zr: 0.019

Composition 4:
  Cr: 0.013
  Ti: 0.014
  V: 0.949
  W: 0.010
  Zr: 0.014

Composition 5:
  Cr: 0.022
  Ti: 0.044
  V: 0.901
  W: 0.021
  Zr: 0.013

Composition 6:
  Cr: 0.016
  Ti: 0.032
  V: 0.927
  W: 0.015
  Zr: 0.011

Composition 7:
  Cr: 0.018
  Ti: 0.018
  V: 0.931
  W: 0.014
  Zr: 0.018

Composition 8:
  Cr: 0.031
  Ti: 0.059
  V: 0.866
  W: 0.028
  Zr: 0.016

Composition 9:
  Cr: 0.013
  Ti: 0.015
  V: 0.947
  W: 0.010
  Zr: 0.015

Composition 10:
  Cr: 0.014
  Ti: 0.029
  V: 0.933
  W: 0.013
  Zr: 0.010

Composition 11:
  Cr: 0.025
  Ti: 0.048
  V: 0.892
  W: 0.023
  Zr: 0.013

Composition 12:
  Cr: 0.013
  Ti: 0.015
  V: 0.948
  W: 0.010
  Zr: 0.015

Composition 13:
  Cr: 0.044
  Ti: 0.082
  V: 0.815
  W: 0.040
  Zr: 0.020

Comp

# Running MCMC on made compositions

In [1]:
import numpy as np
from ase.build import bulk
from forge.core.database import DatabaseManager
from forge.workflows.mcmc import MonteCarloAlloySampler
from mace.calculators.mace import MACECalculator

# 1) Fetch or build your initial supercell
#atoms = bulk("V", "bcc", a=3.03) * (4, 4, 4)  # 128 atoms if 2 atoms/cell * 4^3
db_config = {
    'database': {
        'dbname': 'test_database',
        'user': 'myless',
        'password': 'vcrtiwzr',
        'host': 'database-vcrtiwzr.cfg4i4qmuc4m.us-east-1.rds.amazonaws.com',
        'port': 5432
    }
}
db_manager = DatabaseManager(config_dict=db_config)
# Optionally randomize the composition a bit, or retrieve from your DB
# [Your code here to randomize or fetch structure]

# 1.5) Get a structure from the database without a calculation attached to it 
structures = db_manager.find_structures_without_calculation(model_type="vasp-static")


  _Jd, _W3j_flat, _W3j_indices = torch.load(os.path.join(os.path.dirname(__file__), 'constants.pt'))


In [2]:
from forge.analysis.composition import CompositionAnalyzer
# Example composition
composition = {
    'V': 0.91515,
    'W': 0.021,
    'Cr': 0.02385,
    'Ti': 0.03,
    'Zr': 0.01
}

analyzer = CompositionAnalyzer()
atoms = analyzer.create_random_alloy(
    composition=composition,
    crystal_type='bcc',
    dimensions=[4, 4, 4],
    lattice_constant=3.01,
    balance_element='V',
    cubic=True
)

In [3]:
print(atoms)

Atoms(symbols='Cr4Ti4V115W3Zr2', pbc=True, cell=[12.04, 12.04, 12.04])


In [4]:
import forge
from pathlib import Path
forge_root = Path(forge.__file__).parent

model_path = forge_root / "tests" / "resources" / "potentials" / "mace" / "gen_5_model_0-11-28_stagetwo.model"
# 2) Initialize your ML potential
calc = MACECalculator(model_paths=[model_path],
                      device="cuda",
                      default_dtype="float32")

# 3) Setup Monte Carlo sampler
temperature = 1200.0
steps_per_atom = 100  # e.g., 30 swaps per atom
total_swaps = steps_per_atom * len(atoms) 

mc_sampler = MonteCarloAlloySampler(
    atoms=atoms,
    calculator=calc,
    temperature=temperature,
    steps=total_swaps,
    allowed_species=["V","Cr","Ti","W","Zr"]
)

# 4) Run MC simulation
print("Starting MC simulation...")
final_atoms = mc_sampler.run_mcmc()
print("MC simulation complete.")

  torch.load(f=model_path, map_location=device)


Starting MC simulation...
MC simulation complete.


In [6]:
from ase.io import write

# write before and after atoms to xyz
write('./scratch/data/before_atoms.xyz', atoms)
write('./scratch/data/after_atoms.xyz', final_atoms)


In [8]:
print(atoms)
print(final_atoms)

Atoms(symbols='Cr4Ti4V115W3Zr2', pbc=True, cell=[12.04, 12.04, 12.04])
Atoms(symbols='Cr4Ti4V115W3Zr2', pbc=True, cell=[12.04, 12.04, 12.04], calculator=MACECalculator(...))


In [None]:
# 5) Optionally add final_atoms to your DB
db_config = {
    'database': {
        'dbname': 'test_database',
        'user': 'myless',
        'password': '***',
        'host': 'database-vcrtiwzr.cfg4i4qmuc4m.us-east-1.rds.amazonaws.com',
        'port': 5432
    }
}
db_manager = DatabaseManager(config_dict=db_config)
structure_id = db_manager.add_structure(final_atoms, source_type="MC_Sampler")
print(f"Added MC-refined structure ID: {structure_id}")


# Making VASP Jobs from Suggested Compositions

In [None]:
# using the selected compositions, let's make VASP relaxation jobs for each of them



# Database Working

In [40]:
from forge.core.database import DatabaseManager
from ase.io import read
import os

# 1. Your AWS database credentials
db_config = {
    'database': {
        'dbname': 'test_database',  # The name of the database you created on RDS
        'user': 'myless',           # Replace with your RDS username
        'password': 'vcrtiwzr',     # Replace with your RDS password
        'host': 'database-vcrtiwzr.cfg4i4qmuc4m.us-east-1.rds.amazonaws.com',
        'port': 5432                # Default Postgres port unless you changed it
    }
}

# 2. Instantiate the DatabaseManager
db_manager = DatabaseManager(config_dict=db_config)

# (Optional) If you need to drop and recreate tables, uncomment:
# with db_manager.conn.cursor() as cur:
#     cur.execute("""
#         DROP TABLE IF EXISTS calculations CASCADE;
#         DROP TABLE IF EXISTS structures CASCADE;
#     """)
# db_manager.conn.commit()
# db_manager._initialize_tables()

## Adding New Structures

In [10]:
from ase.io import read, write

data = read('./scratch/data/random_100_gen6.xyz', index=':')
#print(data[0])
#print(data[0].info)
#print(data[0].calc)
write('./scratch/data/random_100_gen6_fixed.xyz', data[5:])

In [13]:
db_manager.add_structures_from_xyz(
    xyz_file='./scratch/data/job_gen_6-2024-12-23.xyz',
    skip_duplicates=True,
    default_model_type="vasp-static"
)

[INFO] Found 20308 frames in ./scratch/data/job_gen_6-2024-12-23.xyz
[INFO] Processing frame 0 from ./scratch/data/job_gen_6-2024-12-23.xyz...
[INFO] Added structure, ID=101
[INFO] Added calculation, ID=101
[INFO] Processing frame 1 from ./scratch/data/job_gen_6-2024-12-23.xyz...
[INFO] Added structure, ID=102
[INFO] Added calculation, ID=102
[INFO] Processing frame 2 from ./scratch/data/job_gen_6-2024-12-23.xyz...
[INFO] Added structure, ID=103
[INFO] Added calculation, ID=103
[INFO] Processing frame 3 from ./scratch/data/job_gen_6-2024-12-23.xyz...
[INFO] Added structure, ID=104
[INFO] Added calculation, ID=104
[INFO] Processing frame 4 from ./scratch/data/job_gen_6-2024-12-23.xyz...
[INFO] Added structure, ID=105
[INFO] Added calculation, ID=105
[INFO] Processing frame 5 from ./scratch/data/job_gen_6-2024-12-23.xyz...
[INFO] Added structure, ID=106
[INFO] Added calculation, ID=106
[INFO] Processing frame 6 from ./scratch/data/job_gen_6-2024-12-23.xyz...
[INFO] Added structure, ID=10

## Clearing Tables (Be Careful!)

In [2]:
# 0. Removing Duplicate Structures and their associated calculations
# Remove duplicates
duplicates = db_manager.remove_duplicate_structures(
    ltol=0.2,    # Length tolerance
    stol=0.3,    # Site tolerance
    angle_tol=5.0 # Angle tolerance in degrees
)

# Get a summary of what was removed
db_manager.get_duplicate_summary(duplicates)

Checking 162 structures for duplicates...
Found 1 duplicates of structure 1
Found 2 duplicates of structure 2
Found 1 duplicates of structure 3
Found 1 duplicates of structure 4
Found 1 duplicates of structure 5
Found 1 duplicates of structure 6
Found 1 duplicates of structure 7
Found 1 duplicates of structure 8
Found 6 duplicates of structure 9
Found 1 duplicates of structure 10
Found 1 duplicates of structure 11
Found 1 duplicates of structure 12
Found 1 duplicates of structure 13
Found 1 duplicates of structure 14
Found 1 duplicates of structure 15
Found 3 duplicates of structure 16
Found 2 duplicates of structure 17
Found 1 duplicates of structure 18
Found 1 duplicates of structure 19
Found 1 duplicates of structure 20
Found 1 duplicates of structure 21
Found 1 duplicates of structure 22
Found 2 duplicates of structure 23
Found 1 duplicates of structure 24
Found 1 duplicates of structure 25
Found 1 duplicates of structure 26
Found 1 duplicates of structure 27
Found 1 duplicates of 

In [4]:
# 1. Clear/Drop Tables
# -------------------
# BE CAREFUL: This will delete all data!
with db_manager.conn.cursor() as cur:
    cur.execute("""
        DROP TABLE IF EXISTS calculations CASCADE;
        DROP TABLE IF EXISTS structures CASCADE;
    """)
db_manager.conn.commit()
db_manager._initialize_tables()

InFailedSqlTransaction: current transaction is aborted, commands ignored until end of transaction block


## Viewing Tables

In [41]:

# 2. View Tables
# -------------
# Count total structures and calculations
with db_manager.conn.cursor() as cur:
    cur.execute("SELECT COUNT(*) FROM structures")
    struct_count = cur.fetchone()[0]
    cur.execute("SELECT COUNT(*) FROM calculations")
    calc_count = cur.fetchone()[0]
print(f"Database contains {struct_count} structures and {calc_count} calculations")


Database contains 20195 structures and 20195 calculations


In [42]:

# View recent structures
with db_manager.conn.cursor() as cur:
    cur.execute("""
        SELECT structure_id, formula, composition, metadata->>'generation' as gen 
        FROM structures 
        ORDER BY structure_id DESC 
        LIMIT 5
    """)
    print("\nRecent structures:")
    for row in cur.fetchall():
        print(f"ID: {row[0]}, Formula: {row[1]}, Composition: {row[2]}, Generation: {row[3]}")




Recent structures:
ID: 20195, Formula: Cr2Ti4V112W4Zr2, Composition: {'V': {'at_frac': 0.9032258064516129, 'num_atoms': 112}, 'W': {'at_frac': 0.03225806451612903, 'num_atoms': 4}, 'Cr': {'at_frac': 0.016129032258064516, 'num_atoms': 2}, 'Ti': {'at_frac': 0.03225806451612903, 'num_atoms': 4}, 'Zr': {'at_frac': 0.016129032258064516, 'num_atoms': 2}}, Generation: 1
ID: 20194, Formula: CrTi2V101W19Zr, Composition: {'V': {'at_frac': 0.8145161290322581, 'num_atoms': 101}, 'W': {'at_frac': 0.1532258064516129, 'num_atoms': 19}, 'Cr': {'at_frac': 0.008064516129032258, 'num_atoms': 1}, 'Ti': {'at_frac': 0.016129032258064516, 'num_atoms': 2}, 'Zr': {'at_frac': 0.008064516129032258, 'num_atoms': 1}}, Generation: 1
ID: 20193, Formula: Cr4Ti2V100W12Zr6, Composition: {'V': {'at_frac': 0.8064516129032258, 'num_atoms': 100}, 'W': {'at_frac': 0.0967741935483871, 'num_atoms': 12}, 'Cr': {'at_frac': 0.03225806451612903, 'num_atoms': 4}, 'Ti': {'at_frac': 0.016129032258064516, 'num_atoms': 2}, 'Zr': {'at

In [43]:
# View calculations for a specific structure
db_manager.get_calculations(20191)

[{'id': 20191,
  'model_type': 'vasp-static',
  'energy': [-1034.10503071],
  'forces': array([[-9.229690e-01, -4.506000e-02,  1.344966e+00],
         [ 6.280400e-01,  2.901000e-03,  1.308524e+00],
         [-4.486000e-03,  1.293928e+00,  1.747160e-01],
         [-2.559973e+00,  1.277421e+00,  1.209620e-01],
         [ 6.437550e-01, -6.355900e-02, -5.342630e-01],
         [ 8.895380e-01, -3.517149e+00, -7.620430e-01],
         [-5.480000e-03,  4.239460e-01, -5.936040e-01],
         [ 1.559373e+00, -7.585400e-02, -9.632360e-01],
         [ 1.239175e+00, -7.066690e-01,  8.629400e-02],
         [-1.607800e-02,  8.207010e-01,  7.601330e-01],
         [ 1.130363e+00, -6.038600e-02, -8.498770e-01],
         [ 4.934040e-01,  2.622390e-01,  7.058600e-02],
         [ 8.845900e-02, -2.536150e-01,  1.254863e+00],
         [ 3.617710e-01,  3.201730e-01, -2.382980e-01],
         [-1.630334e+00, -6.509110e-01,  1.855767e+00],
         [-3.037850e-01, -2.719130e-01, -5.943230e-01],
         [-5.97950

## Search Examples

In [17]:
# 3. Search Examples
# ----------------
# Find structures with specific composition with debugging enabled
cr_structures = db_manager.find_structures(
    elements=['Cr'],  # Must contain Cr
    composition_constraints={
        'Cr': (0.1, 1.0)  # At least 1% Cr
    },
    debug=False  # Enable debugging
)

print(db_manager.get_structure(cr_structures[4]))
print(db_manager.get_calculations(cr_structures[4]))

Atoms(symbols='Cr25Ti25V24W25Zr25', pbc=True, cell=[[12.552320703, 0.033758657, -4.406501718], [-6.249853009, 10.8645436, -4.414950182], [0.001908008, -0.0071473, 13.224785071]])
[{'id': 16, 'model_type': 'vasp-static', 'energy': [-1113.77488505], 'forces': array([[-1.329357, -0.645809,  1.809289],
       [-0.191765, -2.395044,  0.301439],
       [ 0.405569,  0.890876,  5.05648 ],
       [ 0.03796 , -1.18591 , -0.895091],
       [ 0.169056,  2.336213,  0.369428],
       [ 1.167045,  0.076883,  0.661512],
       [-0.79992 ,  0.853109, -0.254597],
       [ 3.393114, -0.264754, -0.951341],
       [ 0.703927, -0.063254,  1.029836],
       [-1.047128,  0.833863, -2.079239],
       [ 0.336355, -0.739547,  0.062264],
       [ 2.439446,  1.178822,  1.689913],
       [ 0.405759,  1.055313, -0.808278],
       [ 1.721694,  0.997199,  0.689911],
       [-1.180354, -2.334016,  1.600422],
       [-1.089908,  2.214927, -0.593671],
       [ 0.91121 , -2.152509,  1.148381],
       [ 1.743501,  1.21594 

In [13]:
# Find structures with high forces
high_force_structures = db_manager.find_structures(
    calculation_constraints={
        'model_type': 'vasp-static',
        'forces_max_magnitude': 0.5  # eV/Å
    }
)
print(f"\nFound {len(high_force_structures)} structures with forces >0.5 eV/Å")



TypeError: DatabaseManager.find_structures() got an unexpected keyword argument 'calculation_constraints'

In [14]:
# Find structures from a specific generation
gen5_structures = db_manager.find_structures(
    metadata_constraints={
        'generation': 5
    }
)
print(f"\nFound {len(gen5_structures)} structures from generation 5")

TypeError: DatabaseManager.find_structures() got an unexpected keyword argument 'metadata_constraints'

In [None]:
# Combine multiple constraints
specific_structures = db_manager.find_structures(
    elements=['V', 'Cr'],
    composition_constraints={
        'V': (0.4, 0.6),    # 40-60% V
        'Cr': (0.1, 0.2)    # 10-20% Cr
    },
    metadata_constraints={
        'generation': 5,
        'structure_type': 'vac'
    },
    calculation_constraints={
        'model_type': 'vasp-static',
        'energy': (-1200, -1000)  # Energy range in eV
    }
)
print(f"\nFound {len(specific_structures)} structures matching all criteria")

## Examine Structure Details

In [None]:

# 4. Examine Structure Details
# --------------------------
# Get full details for a specific structure
def examine_structure(db_manager, structure_id):
    """Get detailed information about a structure and its calculations."""
    with db_manager.conn.cursor() as cur:
        # Get structure details
        cur.execute("""
            SELECT formula, composition, metadata
            FROM structures
            WHERE structure_id = %s
        """, (structure_id,))
        row = cur.fetchone()
        if row:
            print(f"\nStructure {structure_id}:")
            print(f"Formula: {row[0]}")
            print(f"Composition: {row[1]}")
            print(f"Metadata: {row[2]}")
            
            # Get associated calculations
            cur.execute("""
                SELECT model_type, energy, stress, metadata
                FROM calculations
                WHERE structure_id = %s
            """, (structure_id,))
            print("\nCalculations:")
            for calc in cur.fetchall():
                print(f"- {calc[0]}: E={calc[1]:.3f} eV")
                if calc[2]:  # stress
                    print(f"  Stress tensor available")
                if calc[3]:  # metadata
                    status = calc[3].get('status', 'unknown')
                    print(f"  Status: {status}")
        else:
            print(f"No structure found with ID {structure_id}")

# Usage example:
if specific_structures:
    examine_structure(db_manager, specific_structures[0])