In [1]:
import sys
sys.path.append('..')
from PHF_RF_code import *

import warnings
warnings.filterwarnings("ignore")

Making Dummy Dataset

In [2]:
import pandas as pd
import numpy as np

In [3]:
def randomise_sides(length, num_of_diff_sides):
    """Generates random sides

    Each side is drawn with np.random.uniform(3, 9.0) and rounded to
    3 decimal places.

    Args:
        length (int): number of values to generate
        num_of_diff_sides (int): number of sides to be different:
            1 gives a == b with c different, 2 gives a, b and c all
            different, any other value gives a == b == c

    Returns:
        sides_df: dataframe with the generated sides (columns 'a', 'b', 'c')
    """
    # Rows are collected in a plain list and turned into a dataframe once at
    # the end: DataFrame.append no longer exists in pandas >= 2.0, and growing
    # a dataframe row by row is quadratic in the number of rows.
    rows = []

    while len(rows) < length:
        side_a = round(np.random.uniform(3, 9.0), 3)

        if num_of_diff_sides == 1:
            side_b = side_a
            side_c = round(np.random.uniform(3, 9.0), 3)
            # Reject the draw if c happens to equal a (and therefore b).
            if side_a == side_c:
                continue

        elif num_of_diff_sides == 2:
            side_b = round(np.random.uniform(3, 9.0), 3)
            side_c = round(np.random.uniform(3, 9.0), 3)
            # `a != b != c` only compares neighbours and would accept a == c,
            # so check that all three values are pairwise distinct.
            if len({side_a, side_b, side_c}) < 3:
                continue

        else:
            side_b = side_a
            side_c = side_a

        rows.append({'a': side_a, 'b': side_b, 'c': side_c})

    return pd.DataFrame(rows, columns=['a', 'b', 'c'])

def randomise_angles(length, randomise_1, randomise_3, same_3_randomised):
    """Generates random angles

    Each angle is drawn with np.random.uniform(45, 80) and rounded to
    1 decimal place. The flags are checked in the order they are listed
    and the first one that is set decides the mode.

    Args:
        length (int): number of values to generate
        randomise_1 (boolean): randomise one angle
        randomise_3 (boolean): randomise three angles
        same_3_randomised (boolean): randomise three angles with the same value

    Returns:
        angles: list of angles generated; a flat list of floats when
            randomise_1 is set, otherwise a list of [alpha, beta, gamma] lists

    Raises:
        ValueError: if none of the three flags is set
    """
    angles = []

    if randomise_1:
        while len(angles) < length:
            angles.append(round(np.random.uniform(45, 80), 1))

    elif randomise_3:
        while len(angles) < length:
            ang_a = round(np.random.uniform(45, 80), 1)
            ang_b = round(np.random.uniform(45, 80), 1)
            ang_y = round(np.random.uniform(45, 80), 1)

            # `a != b != y` only compares neighbours and would accept a == y,
            # so check that all three angles are pairwise distinct.
            if len({ang_a, ang_b, ang_y}) == 3:
                angles.append([ang_a, ang_b, ang_y])

    elif same_3_randomised:
        while len(angles) < length:
            ang_a = round(np.random.uniform(45, 80), 1)
            angles.append([ang_a, ang_a, ang_a])

    else:
        # Previously this branch printed the message and then crashed with an
        # UnboundLocalError on the return; fail with an explicit error instead.
        raise ValueError('Please choose a valid option')

    return angles

def make_dataset(a, b, c, alpha, beta, gamma, name):
    """Assembles lattice parameters and a structure label into a dataframe

    Args:
        a (list): values for side a
        b (list): values for side b
        c (list): values for side c
        alpha (list): values for angle alpha
        beta (list): values for angle beta
        gamma (list): values for angle gamma
        name (string): name of the structure

    Returns:
        df: dataframe with the columns 'a', 'b', 'c', 'alpha', 'beta',
            'gamma' and 'structure type'
    """
    column_names = ('a', 'b', 'c', 'alpha', 'beta', 'gamma', 'structure type')
    column_values = (a, b, c, alpha, beta, gamma, name)

    # Pair every column name with its values, keeping the column order.
    return pd.DataFrame(dict(zip(column_names, column_values)))

# Make and Featurise Datasets
Datasets are generated for several dataset lengths and unit cell sizes, to see whether these choices make any difference.

In [4]:
# Builds one CSV per (length, unit cell size) combination.
# `length` is the number of rows generated per structure type; seven structure
# types are concatenated, so each file holds length*7 rows.
lengths = [100, 500, 1000, 2000, 5000]
unit_cell_size = [2, 3, 4]
for length in lengths:
    for size in unit_cell_size:
        # The random lattices are drawn inside the size loop, so files for
        # different unit cell sizes at the same length contain different draws.

        # a == b == c, all angles 90
        sides = randomise_sides(length, 0)
        cubic = make_dataset(sides["a"], sides["b"], sides["c"], 90, 90, 90, "cubic")

        # a == b != c, all angles 90
        sides = randomise_sides(length, 1)
        tetragonal = make_dataset(sides["a"], sides["b"], sides["c"], 90, 90, 90, "tetragonal")

        # sides requested as all different, all angles 90
        sides = randomise_sides(length, 2)
        orthorhombic = make_dataset(sides["a"], sides["b"], sides["c"], 90, 90, 90, "orthorhombic")
        
        # a == b != c, gamma fixed at 120
        sides = randomise_sides(length, 1)
        hexagonal = make_dataset(sides["a"], sides["b"], sides["c"], 90, 90, 120, "hexagonal")
        
        # sides requested as all different, only beta randomised
        # NOTE(review): the label is spelled "monoclinc" and is written to the CSV as-is
        sides = randomise_sides(length, 2)
        angles = randomise_angles(length, True, False, False)
        monoclinc = make_dataset(sides["a"], sides["b"], sides["c"], 90, angles, 90, "monoclinc")
        
        # sides and angles all requested as different
        sides = randomise_sides(length, 2)
        angles = randomise_angles(length, False, True, False)
        angles = pd.DataFrame(angles, columns=['alpha', 'beta', 'gamma'])
        triclinic = make_dataset(sides["a"], sides["b"], sides["c"],
                                    angles["alpha"], angles["beta"], angles["gamma"], "triclinic")

        # a == b == c with one random angle shared by alpha, beta and gamma
        sides = randomise_sides(length, 0)
        angles = randomise_angles(length, False, False, True)
        angles = pd.DataFrame(angles, columns=['alpha', 'beta', 'gamma'])
        trigonal = make_dataset(sides["a"], sides["b"], sides["c"], angles["alpha"], 
                                    angles["beta"], angles["gamma"], "trigonal")

        # Stack the seven structure types into one frame with a fresh 0..N-1 index.
        final_df = pd.concat([cubic, tetragonal, orthorhombic, hexagonal, monoclinc, triclinic, trigonal],ignore_index=True)
        
        # One coordinate set per row; `size` is passed straight to get_coords.
        # BravaisLattice comes from the PHF_RF_code star import.
        coordinates = []
        for index, row in final_df.iterrows():
            structure = BravaisLattice(row["a"], row["b"], row["c"], row["alpha"], row["beta"], row["gamma"])
            coordinates.append(structure.get_coords(size))

        # PresistentHomologyFeatures also comes from PHF_RF_code.
        features = PresistentHomologyFeatures(coords=coordinates)
        topol_feat_mat, topol_feat_list = features.featurising_coords()
        
        # Each row of the transposed matrix becomes one "Feature i" column.
        # presumably topol_feat_mat has one row per structure — TODO confirm
        for i, feature in enumerate(topol_feat_mat.T):
            final_df[f"Feature {i}"] = np.squeeze(feature)
        
        # NOTE(review): the dummy_datasets/ directory is not created in this cell,
        # so it has to exist before this runs.
        final_df.to_csv(f'dummy_datasets/Unit_Cell_Size_{size}_Length_{length*7}.csv', index=False)
        print(f'Unit_Cell_Size_{size}_Length_{length*7}.csv created')

Unit_Cell_Size_2_Length_700.csv created
Unit_Cell_Size_3_Length_700.csv created
Unit_Cell_Size_4_Length_700.csv created
Unit_Cell_Size_2_Length_3500.csv created
Unit_Cell_Size_3_Length_3500.csv created
Unit_Cell_Size_4_Length_3500.csv created
Unit_Cell_Size_2_Length_7000.csv created
Unit_Cell_Size_3_Length_7000.csv created
Unit_Cell_Size_4_Length_7000.csv created
Unit_Cell_Size_2_Length_14000.csv created
Unit_Cell_Size_3_Length_14000.csv created
Unit_Cell_Size_4_Length_14000.csv created
Unit_Cell_Size_2_Length_35000.csv created
Unit_Cell_Size_3_Length_35000.csv created
Unit_Cell_Size_4_Length_35000.csv created
