In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go

In [2]:
def corr_matrix(dataframe, features, width, height, text_size):
    features.append("tc")
    # Compute the correlation matrix
    correlation_matrix = dataframe[features].corr()

    # Get the correlation values
    correlation_values = correlation_matrix.values.round(2)

    # Create the heatmap trace
    heatmap = go.Heatmap(
        z=correlation_values,
        x=correlation_matrix.columns,
        y=correlation_matrix.index,
        colorscale='Viridis', 
        colorbar=dict(title='Pearson Coefficient', titleside='right', tickvals = [-1,-0.5,0,0.5,1], ticktext = [-1,-0.5,0,0.5,1]),
        text=correlation_values,
        texttemplate="%{text}",
        textfont={"size":text_size}
    )
    

    # Create the layout
    layout = go.Layout(
        xaxis=dict(
            title="",
            showline=True,
            linewidth=2,
            linecolor='black',
            ticks='inside',
            tickwidth=2,
            ticklen=5
        ),
        yaxis=dict(
            title="",
            showline=True,
            linewidth=2,
            linecolor='black',
            ticks='inside',
            tickwidth=2,
            ticklen=5
        ),
        width=width,
        height=height,
        plot_bgcolor='white',
        paper_bgcolor='white',
        font=dict(family='Helvetica', size=16, color='black'),
        margin=dict(l=5, r=5, b=5, t=10)
    )

    # Create the figure
    fig = go.Figure(data=[heatmap], layout=layout)

    # Show the figure
    fig.show()

# Data Analysis

Just a quick overview

In [3]:
df_MP = pd.read_csv("3DSC_MP.csv", skiprows=1)
df_MP['cif'] = df_MP['cif'].str.replace('data/final/MP/', '')
df_MP.columns = df_MP.columns.str.replace('_2', '')
df_MP.head()

Unnamed: 0,formula_sc,formula_similarity,totreldiff,formula_frac,correct_formula_frac,formula,orig_formula_cif,tc,sc_class,sc_class_unique_sc,...,monoclinic,orthorhombic,tetragonal,triclinic,trigonal,primitive,base-centered,body-centered,face-centered,weight
0,Ag0.02Ge2Pd1.98Sr1,2,0.008,1.0,True,Ag0.02Ge2Pd1.98Sr1,Ge2Pd2Sr1,2.64,Other,True,...,0,0,7,0,0,0,0,1,0,1.0
1,Ag0.15Sn0.85Te1,3,0.15,1.0,True,Ag0.15Sn0.85Te1,Sn1Te1,2.15,Other,True,...,0,0,0,0,0,0,0,0,1,1.0
2,Ag0.1Ge2Pd1.9Sr1,2,0.04,1.0,True,Ag0.1Ge2Pd1.9Sr1,Ge2Pd2Sr1,2.62,Other,True,...,0,0,7,0,0,0,0,1,0,1.0
3,Ag0.1In0.9Te1,3,0.1,1.0,True,Ag0.1In0.9Te1,In1Te1,1.2,Other,True,...,0,0,0,0,0,0,0,0,1,1.0
4,Ag0.2Ba1Si1.8,3,0.133333,4.0,False,Ag0.8Ba4Si7.2,Ba4Si8,3.2,Other,True,...,0,0,0,0,0,1,0,0,0,1.0


In [4]:
for column in df_MP.columns:
    print(f"Column '{column}' has type: {df_MP[column].dtype}")

Column 'formula_sc' has type: object
Column 'formula_similarity' has type: int64
Column 'totreldiff' has type: float64
Column 'formula_frac' has type: float64
Column 'correct_formula_frac' has type: bool
Column 'formula' has type: object
Column 'orig_formula_cif' has type: object
Column 'tc' has type: float64
Column 'sc_class' has type: object
Column 'sc_class_unique_sc' has type: bool
Column 'norm_formula_sc' has type: object
Column 'chemical_composition_sc' has type: object
Column 'num_elements_sc' has type: int64
Column 'origin_sc' has type: object
Column 'old_formula_sc' has type: object
Column 'database_id' has type: object
Column 'original_formula' has type: object
Column 'chemical_composition' has type: object
Column 'norm_formula' has type: object
Column 'spacegroup' has type: object
Column 'crystal_system' has type: object
Column 'lata' has type: float64
Column 'latb' has type: float64
Column 'latc' has type: float64
Column 'cif' has type: object
Column 'original_cif' has type

In [5]:
total_dataset_length = len(df_MP)
print(f"Total dataset length: {total_dataset_length}")

Total dataset length: 5773


## Supercon or Not?

In [6]:
# Is there any NA values in the dataset?
print(df_MP['tc'].isna().any())

False


In [7]:
tc_counts = df_MP['tc'].value_counts()
zero_count = tc_counts[0]
non_zero_count = tc_counts[tc_counts.index != 0].sum()

print(f"Count of tc values that are 0: {zero_count}")
print(f"Count of tc values that are not 0: {non_zero_count}")

Count of tc values that are 0: 1778
Count of tc values that are not 0: 3995


In [8]:
df_MP_non_zero = df_MP[df_MP['tc'] != 0]

df_MP.loc[df_MP["tc"] == 0, "sc_class"] = "Not_supercon"

## Supercon Variety

In [9]:
# Define a dictionary to map the old names to the new names
name_mapping = {
    'Other': 'Other',
    'Not_supercon': 'Not Superconductor',
    'Cuprate': 'Cuprate',
    'Ferrite': 'Ferrite',
    'Heavy_fermion': 'Heavy Fermion',
    'Oxide': 'Oxide',
    'Chevrel': 'Chevrel',
    'Carbon': 'Carbon',
    'Heavy_fermionChevrel': 'Heavy Fermion Chevrel',
    'OxideHeavy_fermion': 'Oxide Heavy Fermion'
}

# Replace the values in the sc_class column with the new names
df_MP['sc_class_name'] = df_MP['sc_class'].map(name_mapping)

In [10]:
sc_class_counts = df_MP['sc_class_name'].value_counts()
print(sc_class_counts)

Other                    2512
Not Superconductor       1778
Cuprate                   576
Ferrite                   406
Heavy Fermion             241
Oxide                     181
Chevrel                    54
Carbon                     18
Heavy Fermion Chevrel       6
Oxide Heavy Fermion         1
Name: sc_class_name, dtype: int64


In [11]:
sc_class_counts = sc_class_counts.reindex(['Not Superconductor', 'Other', 'Cuprate', 'Ferrite', 'Heavy Fermion', 'Oxide', 'Chevrel', 'Carbon', 'Heavy Fermion Chevrel', 'Oxide Heavy Fermion'])
precentages = (df_MP["sc_class_name"].value_counts()/total_dataset_length)*100

# Create a bar plot
data = [go.Bar(x=sc_class_counts.index,
               y=sc_class_counts.values,
               text= [f'{x:.2f}%' for x in precentages],
               textposition='auto',
               marker = dict(color=['rgb(237, 121, 83)', 'rgb(33, 145, 140)', 'rgb(33, 145, 140)', 'rgb(33, 145, 140)', 'rgb(33, 145, 140)', 'rgb(33, 145, 140)', 'rgb(33, 145, 140)', 'rgb(33, 145, 140)', 'rgb(33, 145, 140)', 'rgb(33, 145, 140)']))]

# Create the layout
layout = go.Layout(xaxis=dict(title="Superconductor Class", showline=True, linewidth=2, linecolor='black',
                              ticks='inside', tickwidth=2, ticklen=5),
                   yaxis=dict(title="Count", showline=True, linewidth=2, linecolor='black',
                              ticks='inside', tickwidth=2, ticklen=5), 
                   width=800,
                   height=500,
                   plot_bgcolor='white',
                   paper_bgcolor='white', 
                   font=dict(family='Helvetica', size=16, color='black'),
                   margin=dict(l=10, r=10, b=10, t=10))

# Create the figure
fig = go.Figure(data=data, layout=layout)

# Show the figure
fig.show()

In [12]:
# Plot the variety of chemicals in other category
df_MP_other = df_MP[df_MP['sc_class'] == 'Other']

We have a few superconducting families here, but a lot of them are within the other section

## Formula Analysis

In [13]:
df_MP[["formula_sc", "formula_similarity", "totreldiff", "formula_frac", "correct_formula_frac", "formula", "orig_formula_cif", "tc"]]

Unnamed: 0,formula_sc,formula_similarity,totreldiff,formula_frac,correct_formula_frac,formula,orig_formula_cif,tc
0,Ag0.02Ge2Pd1.98Sr1,2,0.008000,1.0,True,Ag0.02Ge2Pd1.98Sr1,Ge2Pd2Sr1,2.640000
1,Ag0.15Sn0.85Te1,3,0.150000,1.0,True,Ag0.15Sn0.85Te1,Sn1Te1,2.150000
2,Ag0.1Ge2Pd1.9Sr1,2,0.040000,1.0,True,Ag0.1Ge2Pd1.9Sr1,Ge2Pd2Sr1,2.620000
3,Ag0.1In0.9Te1,3,0.100000,1.0,True,Ag0.1In0.9Te1,In1Te1,1.200000
4,Ag0.2Ba1Si1.8,3,0.133333,4.0,False,Ag0.8Ba4Si7.2,Ba4Si8,3.200000
...,...,...,...,...,...,...,...,...
5768,Y1Zn1,1,0.000000,1.0,True,Y1Zn1,Y1Zn1,0.000000
5769,Yb1,1,0.000000,3.0,False,Yb3,Yb3,0.000000
5770,Zn1,1,0.000000,2.0,False,Zn2,Zn2,0.850800
5771,Zn2Zr1,1,0.000000,2.0,False,Zn4Zr2,Zn4Zr2,0.296667


In [14]:
df_MP["totreldiff"].describe()

count    5773.000000
mean        0.032930
std         0.041665
min         0.000000
25%         0.000000
50%         0.013333
75%         0.056000
max         0.150000
Name: totreldiff, dtype: float64

A pretty small total relevtive different in terms of crystal structures, so I will not be removing anything based on this

## Structure Properties

In [15]:
df_MP_non_zero["cat_crystal_system"] = df_MP_non_zero["crystal_system"].astype('category').cat.codes



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [16]:
crystal_properties = ["spacegroup",
                      "crystal_system",
                      "cat_crystal_system",
                      "lata",
                      "latb",
                      "latc",
                      "unit_cell_formula",
                      "cell_volume",
                      "reduced_cell_formula",
                      "primitive",
                      "base-centered",
                      "body-centered",
                      "face-centered",
                      "point_group"
                      ]

df_MP_non_zero[crystal_properties]

Unnamed: 0,spacegroup,crystal_system,cat_crystal_system,lata,latb,latc,unit_cell_formula,cell_volume,reduced_cell_formula,primitive,base-centered,body-centered,face-centered,point_group
0,I 4/m m m,tetragonal,4,4.438672,4.438672,6.030548,"{'Sr': 1.0, 'Ge': 2.0, 'Pd': 2.0}",101.453048,"{'Sr': 1.0, 'Ge': 2.0, 'Pd': 2.0}",0,0,1,0,4/mmm
1,F m -3 m,cubic,0,4.537670,4.537670,4.537670,"{'Sn': 1.0, 'Te': 1.0}",66.066855,"{'Sn': 1.0, 'Te': 1.0}",0,0,0,1,m-3m
2,I 4/m m m,tetragonal,4,4.438672,4.438672,6.030548,"{'Sr': 1.0, 'Ge': 2.0, 'Pd': 2.0}",101.453048,"{'Sr': 1.0, 'Ge': 2.0, 'Pd': 2.0}",0,0,1,0,4/mmm
3,F m -3 m,cubic,0,4.443633,4.443633,4.443633,"{'In': 1.0, 'Te': 1.0}",62.043965,"{'In': 1.0, 'Te': 1.0}",0,0,0,1,m-3m
4,P 43 3 2,cubic,0,6.771439,6.771439,6.771439,"{'Ba': 4.0, 'Si': 8.0}",310.486636,"{'Ba': 1.0, 'Si': 2.0}",1,0,0,0,432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5766,F d -3 m,cubic,0,5.422174,5.422174,5.422174,"{'Zr': 2.0, 'W': 4.0}",112.721135,"{'Zr': 1.0, 'W': 2.0}",0,0,0,1,m-3m
5767,P m -3 n,cubic,0,4.980141,4.980141,4.980141,"{'W': 6.0, 'O': 2.0}",123.516483,"{'W': 3.0, 'O': 1.0}",1,0,0,0,m-3m
5770,P 63/m m c,hexagonal,1,2.626730,2.626731,5.207234,{'Zn': 2.0},31.114928,{'Zn': 1.0},1,0,0,0,6/mmm
5771,F d -3 m,cubic,0,5.244424,5.244424,5.244424,"{'Zr': 2.0, 'Zn': 4.0}",101.994903,"{'Zr': 1.0, 'Zn': 2.0}",0,0,0,1,m-3m


In [17]:
corr_matrix(df_MP_non_zero, crystal_properties, 800, 600, 14)

Really unexpected correlation here with tc I expected volume to have a much higher correlation, I wonder if the latc value here is because of the cuprates?

In [18]:
df_without_cuprates = df_MP_non_zero[df_MP_non_zero['sc_class'] != 'Cuprate']
corr_matrix(df_without_cuprates, crystal_properties, 800, 600, 14)

Yeah it is! interesting each family should show different imporant features. This probably means each family undergoes a different reason why they present superconductivity.

I did not expect the categorised crystal system to have a lowered correlation to tc after taking cuperates away, its probably important to also include categorising the family type and including it as a property

In [19]:
crystal_system_counts = df_MP_non_zero["crystal_system"].value_counts()
precentages = (df_MP_non_zero["crystal_system"].value_counts()/len(df_MP_non_zero))*100
# Create a bar plot
data = [go.Bar(x=crystal_system_counts.index,
               y=crystal_system_counts.values,
               text= [f'{x:.2f}%' for x in precentages],
               textposition='auto',
               marker = dict(color='rgb(33, 145, 140)'))]

# Create the layout
layout = go.Layout(xaxis=dict(title="Crystal System", showline=True, linewidth=2, linecolor='black',
                              ticks='inside', tickwidth=2, ticklen=5),
                   yaxis=dict(title="Count", showline=True, linewidth=2, linecolor='black',
                              ticks='inside', tickwidth=2, ticklen=5), 
                   width=800,
                   height=500,
                   plot_bgcolor='white',
                   paper_bgcolor='white', 
                   font=dict(family='Helvetica', size=16, color='black'),
                   margin=dict(l=10, r=10, b=10, t=10))

# Create the figure
fig = go.Figure(data=data, layout=layout)

# Show the figure
fig.show()

## Energy Value and Gaps

In [20]:
electronic_properties = ["e_above_hull",
                         "efermi",
                         "final_energy",
                         "final_energy_per_atom",
                         "formation_energy_per_atom",
                         "band_gap",
                         "band_structure",
                         "exchange_symmetry"
                         ]

df_MP_non_zero[electronic_properties]

Unnamed: 0,e_above_hull,efermi,final_energy,final_energy_per_atom,formation_energy_per_atom,band_gap,band_structure,exchange_symmetry
0,0.000000,4.015543,-24.862722,-4.972544,-0.713482,0.0000,,139
1,0.000000,6.066451,-7.725196,-3.862598,-0.497162,0.6624,,225
2,0.000000,4.015543,-24.862722,-4.972544,-0.713482,0.0000,,139
3,0.000000,6.314060,-6.472687,-3.236343,-0.499851,0.0000,,225
4,0.013441,5.202543,-55.239871,-4.603323,-0.299456,0.0154,,212
...,...,...,...,...,...,...,...,...
5766,0.000000,5.393069,-69.799473,-11.633246,-0.145261,0.0000,,227
5767,3.105420,6.666585,-40.908650,-5.113581,2.341754,0.0000,,223
5770,0.000000,2.851722,-2.519487,-1.259744,0.000000,0.0000,,194
5771,0.000000,3.547729,-23.949691,-3.991615,-0.302553,0.0000,,227


In [21]:
df_MP[df_MP["band_structure"].notna()]

Unnamed: 0,formula_sc,formula_similarity,totreldiff,formula_frac,correct_formula_frac,formula,orig_formula_cif,tc,sc_class,sc_class_unique_sc,...,orthorhombic,tetragonal,triclinic,trigonal,primitive,base-centered,body-centered,face-centered,weight,sc_class_name
116,Al0.3Cr0.7,3,0.066667,3.0,False,Al0.9Cr2.1,Al1Cr2,0.0,Not_supercon,True,...,0,7,0,0,0,0,1,0,1.0,Not Superconductor
1044,B1Pt1,1,0.0,2.0,False,B2Pt2,B2Pt2,0.0,Not_supercon,True,...,0,0,0,0,1,0,0,0,1.0,Not Superconductor
4699,Li0.33V2O5,2,0.000868,12.0,False,Li3.96V24O60,Li4V24O60,8.5,Oxide,True,...,0,0,0,0,1,0,0,0,1.0,Oxide


In [22]:
corr_matrix(df_MP_non_zero, electronic_properties, 800, 600, 14)

## Magnetic

In [23]:
# Define a dictionary to map the old names to the new names
name_mapping_magnet = {
    'NM': 'Not Magnetic',
    'FM': 'Ferromagnetic  ',
    'FiM': 'Ferrimagnetic',
    'AFM': 'Anti-ferrimagnetic',
}

# Replace the values in the sc_class column with the new names
df_MP["magnetic_type_named"] = df_MP["magnetic_type"].map(name_mapping_magnet)

magnetic_types_count = df_MP["magnetic_type_named"].value_counts()
precentages = (df_MP["magnetic_type_named"].value_counts()/len(df_MP))*100
# Create a bar plot
data = [go.Bar(x=magnetic_types_count.index,
               y=magnetic_types_count.values,
               text= [f'{x:.2f}%' for x in precentages],
               textposition='auto',
               marker = dict(color='rgb(33, 145, 140)'))]

# Create the layout
layout = go.Layout(xaxis=dict(title="Magnetic Type", showline=True, linewidth=2, linecolor='black',
                              ticks='inside', tickwidth=2, ticklen=5),
                   yaxis=dict(title="Count", showline=True, linewidth=2, linecolor='black',
                              ticks='inside', tickwidth=2, ticklen=5), 
                   width=800,
                   height=500,
                   plot_bgcolor='white',
                   paper_bgcolor='white', 
                   font=dict(family='Helvetica', size=16, color='black'),
                   margin=dict(l=10, r=10, b=10, t=10))

# Create the figure
fig = go.Figure(data=data, layout=layout)

# Show the figure
fig.show()

In [24]:
df_MP["cat_magnetic_type"] = df_MP["magnetic_type"].astype('category').cat.codes

In [25]:
magentic_properties = ["cat_magnetic_type",
                       "total_magnetization",
                       "is_magnetic",
                       "num_unique_magnetic_sites",
                       "magmoms",
                       "total_magnetization_normalized_vol",
                       "total_magnetization_normalized_formula_units",
                       "num_magnetic_sites",
                       "true_total_magnetization",
                       ]

In [26]:
corr_matrix(df_MP, magentic_properties, 1000, 800, 14)

None of them correlate to TC probably due to the fact that the majority of them are not magentic anyways