In [1]:
import sys
sys.path.append('..')
from cleaned_code import *

import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go

In [46]:
def corr_matrix(dataframe, features, width, height, text_size):
    """Show an annotated Pearson-correlation heatmap of ``features`` plus ``tc``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table containing the requested feature columns and a ``tc`` column.
    features : sequence of str
        Column names to correlate. The sequence is left untouched; ``tc`` is
        added to a private copy so the target always appears in the matrix.
    width, height : int
        Figure size in pixels.
    text_size : int
        Font size of the coefficient printed inside every cell.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    # Work on a de-duplicated copy. Appending to the caller's list made "tc"
    # accumulate (and appear as duplicate rows/columns) whenever the same
    # list was passed to this function more than once.
    columns = list(dict.fromkeys([*features, "tc"]))

    # Pearson correlation only applies to numeric data. Older pandas silently
    # skipped text columns while pandas >= 2.0 raises on them, so keep the
    # numeric/bool columns explicitly.
    numeric_frame = dataframe[columns].select_dtypes(include=["number", "bool"])

    # Compute the correlation matrix
    correlation_matrix = numeric_frame.corr()

    # Get the correlation values
    correlation_values = correlation_matrix.values.round(2)

    # Axis labels: column names with underscores shown as spaces
    names_x = [name.replace('_', ' ') for name in correlation_matrix]

    # Create the heatmap trace
    heatmap = go.Heatmap(
        z=correlation_values,
        x=names_x,
        y=names_x,
        colorscale='Viridis',
        colorbar=dict(
            # Nested title form instead of the legacy ``titleside`` attribute.
            title=dict(text='Pearson Coefficient', side='right'),
            tickvals=[-1, -0.5, 0, 0.5, 1],
            ticktext=[-1, -0.5, 0, 0.5, 1],
        ),
        text=correlation_values,
        texttemplate="%{text}",
        textfont={"size": text_size},
    )

    # Both axes share the same framing (black axis line, inside ticks)
    axis_style = dict(
        title="",
        showline=True,
        linewidth=5,
        linecolor='black',
        ticks='inside',
        tickwidth=4,
        ticklen=5,
    )

    # Create the layout
    layout = go.Layout(
        xaxis=dict(axis_style),
        yaxis=dict(axis_style),
        width=width,
        height=height,
        plot_bgcolor='white',
        paper_bgcolor='white',
        font=dict(family='Helvetica', size=18, color='black'),
        margin=dict(l=5, r=5, b=5, t=10),
    )

    # Create the figure
    fig = go.Figure(data=[heatmap], layout=layout)

    # Show the figure
    fig.show()

# Data Analysis

Just a quick overview

In [4]:
# Load the table; the first line of the file is skipped, so the second line
# provides the column names.
df_MP = pd.read_csv("3DSC_MP.csv", skiprows=1)

# Drop the literal directory text from the CIF paths.
df_MP['cif'] = df_MP['cif'].str.replace('data/final/MP/', '', regex=False)

# Remove the literal "_2" marker from the column names.
df_MP.columns = df_MP.columns.str.replace('_2', '', regex=False)

df_MP.head()

Unnamed: 0,formula_sc,formula_similarity,totreldiff,formula_frac,correct_formula_frac,formula,orig_formula_cif,tc,sc_class,sc_class_unique_sc,...,monoclinic,orthorhombic,tetragonal,triclinic,trigonal,primitive,base-centered,body-centered,face-centered,weight
0,Ag0.02Ge2Pd1.98Sr1,2,0.008,1.0,True,Ag0.02Ge2Pd1.98Sr1,Ge2Pd2Sr1,2.64,Other,True,...,0,0,7,0,0,0,0,1,0,1.0
1,Ag0.15Sn0.85Te1,3,0.15,1.0,True,Ag0.15Sn0.85Te1,Sn1Te1,2.15,Other,True,...,0,0,0,0,0,0,0,0,1,1.0
2,Ag0.1Ge2Pd1.9Sr1,2,0.04,1.0,True,Ag0.1Ge2Pd1.9Sr1,Ge2Pd2Sr1,2.62,Other,True,...,0,0,7,0,0,0,0,1,0,1.0
3,Ag0.1In0.9Te1,3,0.1,1.0,True,Ag0.1In0.9Te1,In1Te1,1.2,Other,True,...,0,0,0,0,0,0,0,0,1,1.0
4,Ag0.2Ba1Si1.8,3,0.133333,4.0,False,Ag0.8Ba4Si7.2,Ba4Si8,3.2,Other,True,...,0,0,0,0,0,1,0,0,0,1.0


In [5]:
from pymatgen.io.cif import CifParser

def _composition_weight(cif_path):
    """Return the weight of the composition of the first structure in ``cif_path``."""
    # Assuming there's only one structure in the file
    structure = CifParser(cif_path).get_structures()[0]
    # NOTE: ``composition.weight`` is the weight of the whole composition, not
    # a per-atom average; the column name below is kept because later cells
    # refer to it.
    return float(structure.composition.weight)


# Parse every distinct CIF file once and look the result up per row. The
# previous loop parsed one file per row and re-scanned the whole "cif" column
# for each of them, which is quadratic in the number of rows.
_weight_by_cif = {path: _composition_weight(path) for path in df_MP["cif"].unique()}
df_MP["average_atomic_weight"] = df_MP["cif"].map(_weight_by_cif)
df_MP["average_atomic_weight"]

0       445.768964
1       244.683730
2       445.884820
3       241.723020
4       837.818160
           ...    
5768    154.314850
5769    519.120000
5770    130.818000
5771    444.084000
5772    182.448000
Name: average_atomic_weight, Length: 5773, dtype: float64

In [6]:
# Report the dtype pandas inferred for every column.
for column, dtype in df_MP.dtypes.items():
    print(f"Column '{column}' has type: {dtype}")

Column 'formula_sc' has type: object
Column 'formula_similarity' has type: int64
Column 'totreldiff' has type: float64
Column 'formula_frac' has type: float64
Column 'correct_formula_frac' has type: bool
Column 'formula' has type: object
Column 'orig_formula_cif' has type: object
Column 'tc' has type: float64
Column 'sc_class' has type: object
Column 'sc_class_unique_sc' has type: bool
Column 'norm_formula_sc' has type: object
Column 'chemical_composition_sc' has type: object
Column 'num_elements_sc' has type: int64
Column 'origin_sc' has type: object
Column 'old_formula_sc' has type: object
Column 'database_id' has type: object
Column 'original_formula' has type: object
Column 'chemical_composition' has type: object
Column 'norm_formula' has type: object
Column 'spacegroup' has type: object
Column 'crystal_system' has type: object
Column 'lata' has type: float64
Column 'latb' has type: float64
Column 'latc' has type: float64
Column 'cif' has type: object
Column 'original_cif' has type

In [7]:
# Number of rows in the full table (used later as the percentage denominator).
total_dataset_length = df_MP.shape[0]
print(f"Total dataset length: {total_dataset_length}")

Total dataset length: 5773


## Supercon or Not?

In [8]:
# Are there any missing (NA) values in the tc column? (Only tc is checked here.)
print(df_MP['tc'].isna().any())

False


In [9]:
tc_counts = df_MP['tc'].value_counts()

# Count with a boolean mask: looking up the label 0 in value_counts() raises
# KeyError as soon as no entry has tc == 0.
zero_count = int((df_MP['tc'] == 0).sum())
# value_counts() ignores missing values, so they are excluded here as well.
non_zero_count = int(df_MP['tc'].notna().sum()) - zero_count

print(f"Count of tc values that are 0: {zero_count}")
print(f"Count of tc values that are not 0: {non_zero_count}")

Count of tc values that are 0: 1778
Count of tc values that are not 0: 3995


In [10]:
# Entries with a non-zero tc. An explicit copy is taken because a later cell
# adds a column to this frame; writing into a filtered slice of df_MP triggers
# pandas' SettingWithCopyWarning.
df_MP_non_zero = df_MP[df_MP['tc'] != 0].copy()

# Entries with tc == 0 get their own class label.
df_MP.loc[df_MP["tc"] == 0, "sc_class"] = "Not_supercon"

## Supercon Variety

In [11]:
# Display name for every raw sc_class label
name_mapping = dict(
    Other='Other',
    Not_supercon='Not Superconductor',
    Cuprate='Cuprate',
    Ferrite='Ferrite',
    Heavy_fermion='Heavy Fermion',
    Oxide='Oxide',
    Chevrel='Chevrel',
    Carbon='Carbon',
    Heavy_fermionChevrel='Heavy Fermion Chevrel',
    OxideHeavy_fermion='Oxide Heavy Fermion',
)

# Store the display names in a new column; labels missing from the mapping
# become NaN.
df_MP['sc_class_name'] = df_MP['sc_class'].map(name_mapping)

In [12]:
# Number of entries per display class
sc_class_counts = df_MP['sc_class_name'].value_counts()
print(sc_class_counts)

Other                    2512
Not Superconductor       1778
Cuprate                   576
Ferrite                   406
Heavy Fermion             241
Oxide                     181
Chevrel                    54
Carbon                     18
Heavy Fermion Chevrel       6
Oxide Heavy Fermion         1
Name: sc_class_name, dtype: int64


In [13]:
# Fixed display order: non-superconductors first, then the remaining classes.
class_order = ['Not Superconductor', 'Other', 'Cuprate', 'Ferrite', 'Heavy Fermion', 'Oxide',
               'Chevrel', 'Carbon', 'Heavy Fermion Chevrel', 'Oxide Heavy Fermion']
sc_class_counts = df_MP['sc_class_name'].value_counts().reindex(class_order, fill_value=0)

# Derive the percentages from the re-ordered counts. Computing them from a
# fresh value_counts() (sorted by frequency) attached each label to the wrong
# bar, e.g. the "Other" share was printed on the "Not Superconductor" bar.
precentages = (sc_class_counts / total_dataset_length) * 100

# The first bar (non-superconductors) is highlighted, the others share one colour
bar_colors = ['rgb(237, 121, 83)'] + ['rgb(33, 145, 140)'] * (len(class_order) - 1)

# Create a bar plot
data = [go.Bar(x=sc_class_counts.index,
               y=sc_class_counts.values,
               text=[f'{x:.2f}%' for x in precentages],
               textposition='auto',
               marker=dict(color=bar_colors))]

# Create the layout
layout = go.Layout(xaxis=dict(title="Superconductor Class", showline=True, linewidth=5, linecolor='black',
                              ticks='inside', tickwidth=4, ticklen=5),
                   yaxis=dict(title="Count", showline=True, linewidth=5, linecolor='black',
                              ticks='inside', tickwidth=4, ticklen=5),
                   width=1000,
                   height=800,
                   plot_bgcolor='white',
                   paper_bgcolor='white',
                   font=dict(family='Helvetica', size=24, color='black'),
                   margin=dict(l=10, r=10, b=10, t=10))

# Create the figure
fig = go.Figure(data=data, layout=layout)

# Show the figure
fig.show()

In [14]:
# Subset of the entries whose sc_class is 'Other' (nothing is plotted in this cell)
df_MP_other = df_MP[df_MP['sc_class'] == 'Other']

We have a few superconducting families here, but the largest share of the entries falls into the "Other" class.

## Formula Analysis

In [15]:
# Show the formula-related columns next to tc
df_MP[["formula_sc", "formula_similarity", "totreldiff", "formula_frac", "correct_formula_frac", "formula", "orig_formula_cif", "tc"]]

Unnamed: 0,formula_sc,formula_similarity,totreldiff,formula_frac,correct_formula_frac,formula,orig_formula_cif,tc
0,Ag0.02Ge2Pd1.98Sr1,2,0.008000,1.0,True,Ag0.02Ge2Pd1.98Sr1,Ge2Pd2Sr1,2.640000
1,Ag0.15Sn0.85Te1,3,0.150000,1.0,True,Ag0.15Sn0.85Te1,Sn1Te1,2.150000
2,Ag0.1Ge2Pd1.9Sr1,2,0.040000,1.0,True,Ag0.1Ge2Pd1.9Sr1,Ge2Pd2Sr1,2.620000
3,Ag0.1In0.9Te1,3,0.100000,1.0,True,Ag0.1In0.9Te1,In1Te1,1.200000
4,Ag0.2Ba1Si1.8,3,0.133333,4.0,False,Ag0.8Ba4Si7.2,Ba4Si8,3.200000
...,...,...,...,...,...,...,...,...
5768,Y1Zn1,1,0.000000,1.0,True,Y1Zn1,Y1Zn1,0.000000
5769,Yb1,1,0.000000,3.0,False,Yb3,Yb3,0.000000
5770,Zn1,1,0.000000,2.0,False,Zn2,Zn2,0.850800
5771,Zn2Zr1,1,0.000000,2.0,False,Zn4Zr2,Zn4Zr2,0.296667


In [16]:
# Summary statistics of the totreldiff column
df_MP["totreldiff"].describe()

count    5773.000000
mean        0.032930
std         0.041665
min         0.000000
25%         0.000000
50%         0.013333
75%         0.056000
max         0.150000
Name: totreldiff, dtype: float64

In [17]:
# Summary statistics of the formula_frac column
df_MP["formula_frac"].describe()

count    5773.000000
mean        2.246826
std         2.447629
min         0.018180
25%         1.000000
50%         2.000000
75%         2.000000
max        64.000000
Name: formula_frac, dtype: float64

In [18]:
# Group the synthetically doped entries by the formula of their original CIF.
# NOTE(review): `.indices` reports integer positions inside the filtered frame,
# not df_MP row labels (`.groups` would give labels) - confirm which is wanted.
df_MP[df_MP["synth_doped"] == True].groupby("orig_formula_cif").indices

{'Ag1Se2Sn1': array([12, 17, 18, 20, 21, 22, 24], dtype=int64),
 'Ag1Sn1Te2': array([7], dtype=int64),
 'Ag3Al1': array([25], dtype=int64),
 'Ag3Ge1': array([26], dtype=int64),
 'Ag3Hg1': array([14], dtype=int64),
 'Ag3In1': array([16], dtype=int64),
 'Ag3Zn1': array([15], dtype=int64),
 'Ag4Al2': array([ 8,  9, 10, 11, 13], dtype=int64),
 'Ag4Cl6': array([23], dtype=int64),
 'Ag4Hg4': array([6], dtype=int64),
 'Ag6Ga2': array([19], dtype=int64),
 'Al12Ge10': array([161], dtype=int64),
 'Al12Mg17': array([212], dtype=int64),
 'Al12Mg8': array([68], dtype=int64),
 'Al12Mn1': array([145], dtype=int64),
 'Al1B2': array([108], dtype=int64),
 'Al1B4Mg1': array([69, 94, 95], dtype=int64),
 'Al1Ba1Si1': array([130], dtype=int64),
 'Al1Ca1Si1': array([111, 112, 148, 149, 155, 156], dtype=int64),
 'Al1Cr2': array([81], dtype=int64),
 'Al1Ga1Nb6': array([76, 77, 93], dtype=int64),
 'Al1Ga1V6': array([82], dtype=int64),
 'Al1Ge1Nb6': array([ 78,  89,  91,  92,  96,  99, 101, 102, 103, 105, 114], 

In [19]:
# All entries whose original CIF formula is Ba2Ca1Cu2Tl1O7
df_Ba2Ca1Cu2Tl1O7 = df_MP[df_MP["orig_formula_cif"] == "Ba2Ca1Cu2Tl1O7"]
df_Ba2Ca1Cu2Tl1O7

Unnamed: 0,formula_sc,formula_similarity,totreldiff,formula_frac,correct_formula_frac,formula,orig_formula_cif,tc,sc_class,sc_class_unique_sc,...,tetragonal,triclinic,trigonal,primitive,base-centered,body-centered,face-centered,weight,average_atomic_weight,sc_class_name
1800,Ba2Ca0.7Cu2Gd0.3Tl1O7,3,0.046154,1.0,True,Ba2Ca0.7Cu2Gd0.3Tl1O7,Ba2Ca1Cu2Tl1O7,81.25,Cuprate,True,...,7,0,0,1,0,0,0,1.0,793.3547,Cuprate
1801,Ba2Ca0.7Cu2Nd0.3Tl1O7,3,0.046154,1.0,True,Ba2Ca0.7Cu2Nd0.3Tl1O7,Ba2Ca1Cu2Tl1O7,95.5,Cuprate,True,...,7,0,0,1,0,0,0,1.0,789.4523,Cuprate
1804,Ba2Ca0.8Cu2Gd0.2Tl1O7,3,0.030769,1.0,True,Ba2Ca0.8Cu2Gd0.2Tl1O7,Ba2Ca1Cu2Tl1O7,93.6,Cuprate,True,...,7,0,0,1,0,0,0,1.0,781.6375,Cuprate
1805,Ba2Ca0.8Cu2Nd0.2Tl1O7,3,0.030769,1.0,True,Ba2Ca0.8Cu2Nd0.2Tl1O7,Ba2Ca1Cu2Tl1O7,96.3,Cuprate,True,...,7,0,0,1,0,0,0,1.0,779.0359,Cuprate
1806,Ba2Ca0.8Cu2Tl1Y0.2O7,3,0.030769,1.0,True,Ba2Ca0.8Cu2Tl1Y0.2O7,Ba2Ca1Cu2Tl1O7,97.6,Cuprate,True,...,7,0,0,1,0,0,0,1.0,767.96867,Cuprate
1808,Ba2Ca0.9Cu2Gd0.1Tl1O7,2,0.015385,1.0,True,Ba2Ca0.9Cu2Gd0.1Tl1O7,Ba2Ca1Cu2Tl1O7,91.8,Cuprate,True,...,7,0,0,1,0,0,0,1.0,769.9203,Cuprate
1809,Ba2Ca0.9Cu2Nd0.1Tl1O7,2,0.015385,1.0,True,Ba2Ca0.9Cu2Nd0.1Tl1O7,Ba2Ca1Cu2Tl1O7,90.8,Cuprate,True,...,7,0,0,1,0,0,0,1.0,768.6195,Cuprate
1810,Ba2Ca0.9Cu2Tl1Y0.1O7,2,0.015385,1.0,True,Ba2Ca0.9Cu2Tl1Y0.1O7,Ba2Ca1Cu2Tl1O7,86.1,Cuprate,True,...,7,0,0,1,0,0,0,1.0,763.085885,Cuprate
1824,Ba2Ca1Cu2Tl1O7,1,0.0,1.0,True,Ba2Ca1Cu2Tl1O7,Ba2Ca1Cu2Tl1O7,80.8,Cuprate,True,...,7,0,0,1,0,0,0,1.0,758.2031,Cuprate
1825,Ba2Ca1Cu2Tl1O8,3,0.065934,0.875,True,Ba1.75Ca0.875Cu1.75Tl0.875O7,Ba2Ca1Cu2Tl1O7,105.981667,Cuprate,True,...,7,0,0,1,0,0,0,1.0,677.427188,Cuprate


In [47]:
# Formula / doping descriptors to correlate with tc
doping_properties = [
    "formula_frac",
    "correct_formula_frac",
    "formula_similarity",
    "totreldiff",
    "average_atomic_weight",
]

corr_matrix(df_MP_non_zero, doping_properties, width=800, height=600, text_size=16)

The relative difference (`totreldiff`) makes it possible to differentiate between compounds whose formulas are very similar to each other.

## Structure Properties

In [21]:
# Integer-encode the crystal system so that it can enter the correlation matrix.
# The codes follow the sorted category names, so their order carries no physical meaning.
df_MP_non_zero["cat_crystal_system"] = pd.Categorical(df_MP_non_zero["crystal_system"]).codes

In [22]:
# Structural descriptors inspected in this section
crystal_properties = [
    # symmetry
    "spacegroup", "crystal_system", "cat_crystal_system",
    # lattice parameters
    "lata", "latb", "latc",
    # cell contents and size
    "unit_cell_formula", "cell_volume", "reduced_cell_formula",
    # centering flags
    "primitive", "base-centered", "body-centered", "face-centered",
    "point_group",
]

df_MP_non_zero[crystal_properties]

Unnamed: 0,spacegroup,crystal_system,cat_crystal_system,lata,latb,latc,unit_cell_formula,cell_volume,reduced_cell_formula,primitive,base-centered,body-centered,face-centered,point_group
0,I 4/m m m,tetragonal,4,4.438672,4.438672,6.030548,"{'Sr': 1.0, 'Ge': 2.0, 'Pd': 2.0}",101.453048,"{'Sr': 1.0, 'Ge': 2.0, 'Pd': 2.0}",0,0,1,0,4/mmm
1,F m -3 m,cubic,0,4.537670,4.537670,4.537670,"{'Sn': 1.0, 'Te': 1.0}",66.066855,"{'Sn': 1.0, 'Te': 1.0}",0,0,0,1,m-3m
2,I 4/m m m,tetragonal,4,4.438672,4.438672,6.030548,"{'Sr': 1.0, 'Ge': 2.0, 'Pd': 2.0}",101.453048,"{'Sr': 1.0, 'Ge': 2.0, 'Pd': 2.0}",0,0,1,0,4/mmm
3,F m -3 m,cubic,0,4.443633,4.443633,4.443633,"{'In': 1.0, 'Te': 1.0}",62.043965,"{'In': 1.0, 'Te': 1.0}",0,0,0,1,m-3m
4,P 43 3 2,cubic,0,6.771439,6.771439,6.771439,"{'Ba': 4.0, 'Si': 8.0}",310.486636,"{'Ba': 1.0, 'Si': 2.0}",1,0,0,0,432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5766,F d -3 m,cubic,0,5.422174,5.422174,5.422174,"{'Zr': 2.0, 'W': 4.0}",112.721135,"{'Zr': 1.0, 'W': 2.0}",0,0,0,1,m-3m
5767,P m -3 n,cubic,0,4.980141,4.980141,4.980141,"{'W': 6.0, 'O': 2.0}",123.516483,"{'W': 3.0, 'O': 1.0}",1,0,0,0,m-3m
5770,P 63/m m c,hexagonal,1,2.626730,2.626731,5.207234,{'Zn': 2.0},31.114928,{'Zn': 1.0},1,0,0,0,6/mmm
5771,F d -3 m,cubic,0,5.244424,5.244424,5.244424,"{'Zr': 2.0, 'Zn': 4.0}",101.994903,"{'Zr': 1.0, 'Zn': 2.0}",0,0,0,1,m-3m


In [48]:
# Correlation of the structural descriptors with tc
corr_matrix(df_MP_non_zero, crystal_properties, width=1000, height=800, text_size=18)

This correlation with tc is really unexpected: I expected the cell volume to have a much higher correlation. I wonder whether the latc value here is caused by the cuprates?

In [49]:
# Same structural correlation matrix, with the cuprate entries left out
df_without_cuprates = df_MP_non_zero.loc[df_MP_non_zero['sc_class'] != 'Cuprate']
corr_matrix(df_without_cuprates, crystal_properties, width=1000, height=800, text_size=18)

Yes, it is! Interestingly, each family should show different important features. This probably means that superconductivity arises for a different reason in each family.

I did not expect the categorised crystal system to have a lower correlation with tc after taking the cuprates away. It is probably also important to categorise the family type and include it as a property.

In [25]:
# Frequency of every crystal system among the non-zero-tc entries, plus its share in percent
crystal_system_counts = df_MP_non_zero["crystal_system"].value_counts()
precentages = (crystal_system_counts / len(df_MP_non_zero)) * 100

# Both axes share the same framing
_axis_style = dict(showline=True, linewidth=5, linecolor='black',
                   ticks='inside', tickwidth=4, ticklen=5)

# One bar per crystal system, annotated with its percentage
data = [
    go.Bar(
        x=crystal_system_counts.index,
        y=crystal_system_counts.values,
        text=[f'{share:.2f}%' for share in precentages],
        textposition='auto',
        marker=dict(color='rgb(33, 145, 140)'),
    )
]

# Figure layout
layout = go.Layout(
    xaxis=dict(title="Crystal System", **_axis_style),
    yaxis=dict(title="Count", **_axis_style),
    width=800,
    height=500,
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(family='Helvetica', size=20, color='black'),
    margin=dict(l=10, r=10, b=10, t=10),
)

# Build and render the figure
fig = go.Figure(data=data, layout=layout)
fig.show()

## Energy Value and Gaps

In [26]:
# Energy and band-structure descriptors inspected in this section
electronic_properties = [
    "e_above_hull", "efermi",
    "final_energy", "energy",
    "final_energy_per_atom", "energy_per_atom",
    "formation_energy_per_atom",
    "band_gap", "band_structure",
    "exchange_symmetry",
]

df_MP_non_zero[electronic_properties]

Unnamed: 0,e_above_hull,efermi,final_energy,energy,final_energy_per_atom,energy_per_atom,formation_energy_per_atom,band_gap,band_structure,exchange_symmetry
0,0.000000,4.015543,-24.862722,-24.862722,-4.972544,-4.972544,-0.713482,0.0000,,139
1,0.000000,6.066451,-7.725196,-7.725196,-3.862598,-3.862598,-0.497162,0.6624,,225
2,0.000000,4.015543,-24.862722,-24.862722,-4.972544,-4.972544,-0.713482,0.0000,,139
3,0.000000,6.314060,-6.472687,-6.472687,-3.236343,-3.236343,-0.499851,0.0000,,225
4,0.013441,5.202543,-55.239871,-55.239871,-4.603323,-4.603323,-0.299456,0.0154,,212
...,...,...,...,...,...,...,...,...,...,...
5766,0.000000,5.393069,-69.799473,-69.799473,-11.633246,-11.633246,-0.145261,0.0000,,227
5767,3.105420,6.666585,-40.908650,-40.908650,-5.113581,-5.113581,2.341754,0.0000,,223
5770,0.000000,2.851722,-2.519487,-2.519487,-1.259744,-1.259744,0.000000,0.0000,,194
5771,0.000000,3.547729,-23.949691,-23.949691,-3.991615,-3.991615,-0.302553,0.0000,,227


In [27]:
# Rows for which a band_structure value is present
df_MP[df_MP["band_structure"].notna()]

Unnamed: 0,formula_sc,formula_similarity,totreldiff,formula_frac,correct_formula_frac,formula,orig_formula_cif,tc,sc_class,sc_class_unique_sc,...,tetragonal,triclinic,trigonal,primitive,base-centered,body-centered,face-centered,weight,average_atomic_weight,sc_class_name
116,Al0.3Cr0.7,3,0.066667,3.0,False,Al0.9Cr2.1,Al1Cr2,0.0,Not_supercon,True,...,7,0,0,0,0,1,0,1.0,133.475195,Not Superconductor
1044,B1Pt1,1,0.0,2.0,False,B2Pt2,B2Pt2,0.0,Not_supercon,True,...,0,0,0,1,0,0,0,1.0,411.79,Not Superconductor
4699,Li0.33V2O5,2,0.000868,12.0,False,Li3.96V24O60,Li4V24O60,8.5,Oxide,True,...,0,0,0,1,0,0,0,1.0,2210.04636,Oxide


In [50]:
# Correlation of the electronic descriptors with tc
corr_matrix(df_MP_non_zero, electronic_properties, width=1000, height=800, text_size=16)

## Magnetic

In [40]:
# Display name for every magnetic ordering code
name_mapping_magnet = {
    'NM': 'Not Magnetic',
    'FM': 'Ferromagnetic  ',
    'FiM': 'Ferrimagnetic',
    # AFM stands for antiferromagnetic; the previous label read "Anti-ferrimagnetic"
    'AFM': 'Antiferromagnetic',
}

# Store the display names of the magnetic_type column in a new column
df_MP["magnetic_type_named"] = df_MP["magnetic_type"].map(name_mapping_magnet)

# Counts and percentages share one ordering because both come from the same Series
magnetic_types_count = df_MP["magnetic_type_named"].value_counts()
precentages = (magnetic_types_count / len(df_MP)) * 100

# Create a bar plot
data = [go.Bar(x=magnetic_types_count.index,
               y=magnetic_types_count.values,
               text=[f'{x:.2f}%' for x in precentages],
               textposition='auto',
               marker=dict(color='rgb(33, 145, 140)'))]

# Create the layout
layout = go.Layout(xaxis=dict(title="Magnetic Type", showline=True, linewidth=5, linecolor='black',
                              ticks='inside', tickwidth=3, ticklen=5),
                   yaxis=dict(title="Count", showline=True, linewidth=5, linecolor='black',
                              ticks='inside', tickwidth=3, ticklen=5),
                   width=800,
                   height=500,
                   plot_bgcolor='white',
                   paper_bgcolor='white',
                   font=dict(family='Helvetica', size=20, color='black'),
                   margin=dict(l=10, r=10, b=10, t=10))

# Create the figure
fig = go.Figure(data=data, layout=layout)

# Show the figure
fig.show()

In [30]:
# Integer-encode the magnetic ordering code; missing values are encoded as -1
df_MP["cat_magnetic_type"] = pd.Categorical(df_MP["magnetic_type"]).codes

In [31]:
# Magnetic descriptors inspected in this section
magentic_properties = [
    "cat_magnetic_type",
    "total_magnetization", "is_magnetic",
    "num_unique_magnetic_sites", "magmoms",
    "total_magnetization_normalized_vol",
    "total_magnetization_normalized_formula_units",
    "num_magnetic_sites", "true_total_magnetization",
]

In [53]:
# Correlation of the magnetic descriptors with tc (all entries)
corr_matrix(df_MP, magentic_properties, width=1100, height=800, text_size=18)

None of them correlates with Tc, probably because the majority of the entries are not magnetic anyway. Still, knowing whether an entry is magnetic or not might be useful.

## Select K Best

In [33]:
# Columns that are removed before feature selection
not_features = [
    'tc', 'formula_sc', 'formula', 'orig_formula_cif', 'norm_formula_sc',
    'chemical_composition_sc', 'origin_sc', 'old_formula_sc', 'database_id',
    'original_formula', 'chemical_composition', 'norm_formula', 'spacegroup',
    'crystal_system', 'cif', 'original_cif', 'material_id', 'band_structure',
    'created_at', 'doi', 'doi_bibtex', 'dos', 'exp', 'has', 'has_bandstructure',
    'icsd_ids', 'last_updated', 'magnetic_type', 'ntask_ids',
    'original_task_id', 'oxide_type', 'pretty_formula', 'pseudo_potential',
    'reduced_cell_formula', 'run_type', 'task_id', 'task_ids',
    'unit_cell_formula', 'warnings', 'ordering', 'magmoms', 'origin',
    'cif_before_synthetic_doping', 'Reason for exclusion', 'graph',
    'crystal_temp', 'no_crystal_temp_given', 'point_group', "weight",
    'sc_class_name', 'magnetic_type_named',
    "energy_per_atom", "energy", "total_magnetization",
    "total_magnetization_normalized_vol",
    "total_magnetization_normalized_formula_units",
]

# Candidate features for every entry
df_features_all = df_MP.drop(columns=not_features)

# Candidate features and target for the 'Other' class only
df_features_all_other = df_features_all.loc[df_MP['sc_class'] == 'Other']
df_Tc_other = df_MP_other['tc']

In [34]:
# The class label is not a feature. errors='ignore' keeps this cell re-runnable:
# without it a second execution fails with KeyError because the column is already gone.
df_features_all_other = df_features_all_other.drop(columns=['sc_class'], errors='ignore')
df_features_all = df_features_all.drop(columns=['sc_class'], errors='ignore')

In [35]:
# Number of candidate feature columns
df_features_all.shape[1]

39

In [36]:
from functools import partial

from sklearn.feature_selection import SelectKBest, mutual_info_regression

# mutual_info_regression adds small random noise to continuous variables, so
# the scores (and possibly the selected columns) change from run to run unless
# the seed is pinned.
_seeded_mutual_info = partial(mutual_info_regression, random_state=0)

# Rank the features on the 'Other' class only and keep the 15 best
selector = SelectKBest(_seeded_mutual_info, k=15)
selector.fit(df_features_all_other, df_Tc_other)

# Get columns to keep and create new dataframe with those only.
# Positional selection is valid because both feature tables share the same column order.
cols_idxs = selector.get_support(indices=True)
features_df_new = df_features_all.iloc[:, cols_idxs]
features_df_new

Unnamed: 0,num_elements_sc,lata,latb,latc,density,e_above_hull,efermi,final_energy,final_energy_per_atom,formation_energy_per_atom,nsites,cell_volume,exchange_symmetry,true_total_magnetization,average_atomic_weight
0,4,4.438672,4.438672,6.030548,7.295677,0.000000,4.015543,-24.862722,-4.972544,-0.713482,5,101.453048,139,0.001259,445.768964
1,3,4.537670,4.537670,4.537670,6.190810,0.000000,6.066451,-7.725196,-3.862598,-0.497162,2,66.066855,225,0.000000,244.683730
2,4,4.438672,4.438672,6.030548,7.295677,0.000000,4.015543,-24.862722,-4.972544,-0.713482,5,101.453048,139,0.001259,445.884820
3,3,4.443633,4.443633,4.443633,6.488053,0.000000,6.314060,-6.472687,-3.236343,-0.499851,2,62.043965,225,0.000182,241.723020
4,3,6.771439,6.771439,6.771439,4.139450,0.013441,5.202543,-55.239871,-4.603323,-0.299456,12,310.486636,212,0.000000,837.818160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5768,2,3.585441,3.585441,3.585441,5.559415,0.000000,3.144927,-8.459703,-4.229851,-0.366744,2,46.092232,221,0.054521,154.314850
5769,1,3.878974,3.878974,9.704523,7.005867,0.000000,1.419946,-4.618825,-1.539608,0.000000,3,123.042457,166,0.001515,519.120000
5770,1,2.626730,2.626731,5.207234,6.981485,0.000000,2.851722,-2.519487,-1.259744,0.000000,2,31.114928,194,0.000637,130.818000
5771,2,5.244424,5.244424,5.244424,7.229958,0.000000,3.547729,-23.949691,-3.991615,-0.302553,6,101.994903,227,1.821118,444.084000


In [37]:
# Selected feature names, with totreldiff appended by hand
list_of_colums = [*features_df_new.columns, "totreldiff"]
list_of_colums

['num_elements_sc',
 'lata',
 'latb',
 'latc',
 'density',
 'e_above_hull',
 'efermi',
 'final_energy',
 'final_energy_per_atom',
 'formation_energy_per_atom',
 'nsites',
 'cell_volume',
 'exchange_symmetry',
 'true_total_magnetization',
 'average_atomic_weight',
 'totreldiff']

In [56]:
# Correlation of the selected features with tc (all entries)
corr_matrix(df_MP, list_of_colums, width=1100, height=900, text_size=18)

These features, together with persistent homology, will be the ones I use to predict Tc.