# Imports
---

In [1]:
import numpy as np 
import pandas as pd

# Cargar Data
___

In [2]:
# Load the raw dataset from a CSV file located next to the notebook.
df_data = pd.read_csv('./dataset.csv')
# Bare last expression: renders the loaded frame as the cell output.
df_data

Unnamed: 0,Chemical formula,A,B,In literature,Valence A,Valence B,Radius A [ang],Radius B [ang],Lowest distortion,Formation energy [eV/atom],...,Magnetic moment [mu_B],Volume per atom [A^3/atom],Band gap [eV],a [ang],b [ang],c [ang],alpha [deg],beta [deg],gamma [deg],Vacancy energy [eV/O atom]
0,Ac2O3,Ac,Ac,False,element not in BV,element not in BV,1.12,1.12,cubic,-2.732,...,0.000,20.836,0.332,4.705,4.705,4.705,90.0,90.0,90.0,3.150
1,AcAgO3,Ac,Ag,False,element not in BV,element not in BV,1.12,0.95,orthorhombic,-1.957,...,0.000,14.485,0.000,5.779,6.077,8.248,90.0,90.0,90.0,0.817
2,AcAlO3,Ac,Al,False,element not in BV,element not in BV,1.12,0.54,cubic,-3.532,...,0.000,11.487,4.307,3.858,3.858,3.858,90.0,90.0,90.0,6.695
3,AcAsO3,Ac,As,False,element not in BV,element not in BV,1.12,0.52,orthorhombic,-2.398,...,0.000,14.355,0.000,5.780,6.012,8.262,90.0,90.0,90.0,3.634
4,AcAuO3,Ac,Au,False,element not in BV,element not in BV,1.12,0.93,orthorhombic,-2.006,...,0.000,15.190,0.745,5.899,6.750,7.630,90.0,90.0,90.0,0.807
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5324,ZrWO3,Zr,W,False,1,5,0.89,0.62,cubic,-1.637,...,0.339,12.200,0.000,3.936,3.936,3.936,90.0,90.0,90.0,0.191
5325,ZrYO3,Zr,Y,False,not balanced,not balanced,0.89,0.90,cubic,-2.126,...,0.200,15.277,0.000,4.243,4.243,4.243,90.0,90.0,90.0,-4.920
5326,ZrYbO3,Zr,Yb,False,not balanced,not balanced,0.89,0.95,orthorhombic,-3.455,...,0.000,13.136,4.007,5.558,5.726,8.254,90.0,90.0,90.0,-6.177
5327,ZrZnO3,Zr,Zn,False,not balanced,not balanced,0.89,0.74,cubic,-1.630,...,0.001,10.804,0.000,3.780,3.780,3.780,90.0,90.0,90.0,-0.762


# Análisis exploratorio
___

> Comprobar conformidad en los datos (i.e., Dtype) y conocer features

In [3]:
# Column names, non-null counts and dtypes of the raw frame.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5329 entries, 0 to 5328
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Chemical formula            5329 non-null   object 
 1   A                           5329 non-null   object 
 2   B                           5329 non-null   object 
 3   In literature               5329 non-null   bool   
 4   Valence A                   5329 non-null   object 
 5   Valence B                   5329 non-null   object 
 6   Radius A [ang]              5329 non-null   float64
 7   Radius B [ang]              5329 non-null   float64
 8   Lowest distortion           5329 non-null   object 
 9   Formation energy [eV/atom]  5329 non-null   object 
 10  Stability [eV/atom]         5329 non-null   object 
 11  Magnetic moment [mu_B]      5329 non-null   object 
 12  Volume per atom [A^3/atom]  5329 non-null   object 
 13  Band gap [eV]               5329 

> Conocer las categorías a clasificar

In [4]:
# Distinct values of the label column; note that '-' appears as a placeholder value.
df_data['Lowest distortion'].unique()

array(['cubic', 'orthorhombic', 'rhombohedral', 'tetragonal', '-'],
      dtype=object)

> Resumen info de variables numéricas

In [5]:
# Summary statistics. describe() only covers numeric-dtype columns by default,
# so columns still stored as object dtype are not included here.
df_data.describe()

Unnamed: 0,Radius A [ang],Radius B [ang]
count,5329.0,5329.0
mean,0.974656,0.817846
std,0.337139,0.247479
min,0.27,0.27
25%,0.69,0.63
50%,0.93,0.77
75%,1.18,0.95
max,1.88,1.67


# Limpieza de datos
___

> Eliminar columnas (features) no relevantes para la clasificación

- Elimino

In [6]:
# Remove the columns that are not used as classification features:
# the formula string, the literature flag, and the a/b/c and alpha/beta/gamma columns.
df_data = df_data.drop(
    columns=[
        'Chemical formula',
        'In literature',
        'a [ang]',
        'b [ang]',
        'c [ang]',
        'alpha [deg]',
        'beta [deg]',
        'gamma [deg]',
    ]
)


- Compruebo

In [7]:
# Confirm which columns remain after the drop.
df_data.columns

Index(['A', 'B', 'Valence A', 'Valence B', 'Radius A [ang]', 'Radius B [ang]',
       'Lowest distortion', 'Formation energy [eV/atom]',
       'Stability [eV/atom]', 'Magnetic moment [mu_B]',
       'Volume per atom [A^3/atom]', 'Band gap [eV]',
       'Vacancy energy [eV/O atom]'],
      dtype='object')

> Eliminar datos con categoría desconocida (esto además mejora la conformidad, pues varios descriptores tienen información faltante en esos casos)

- Valido cuántos hay

In [8]:
# Number of rows whose label is the '-' placeholder.
df_data['Lowest distortion'].eq('-').sum()

53

- Los elimino

In [9]:
# Collect the index labels of every row whose label is the '-' placeholder.
# A boolean mask over the column replaces the row-by-row iterrows() scan.
lost_data_indexes = df_data.index[df_data['Lowest distortion'] == '-'].tolist()

# Drop those rows by label; the surviving rows keep their original index values.
df_data = df_data.drop(lost_data_indexes)

- Compruebo que ya no estén

In [10]:
# Print how many '-' labels are left, then show the frame as the cell output.
print(df_data['Lowest distortion'].eq('-').sum())
df_data

0


Unnamed: 0,A,B,Valence A,Valence B,Radius A [ang],Radius B [ang],Lowest distortion,Formation energy [eV/atom],Stability [eV/atom],Magnetic moment [mu_B],Volume per atom [A^3/atom],Band gap [eV],Vacancy energy [eV/O atom]
0,Ac,Ac,element not in BV,element not in BV,1.12,1.12,cubic,-2.732,0.848,0.000,20.836,0.332,3.150
1,Ac,Ag,element not in BV,element not in BV,1.12,0.95,orthorhombic,-1.957,-0.055,0.000,14.485,0.000,0.817
2,Ac,Al,element not in BV,element not in BV,1.12,0.54,cubic,-3.532,-0.110,0.000,11.487,4.307,6.695
3,Ac,As,element not in BV,element not in BV,1.12,0.52,orthorhombic,-2.398,0.224,0.000,14.355,0.000,3.634
4,Ac,Au,element not in BV,element not in BV,1.12,0.93,orthorhombic,-2.006,-0.056,0.000,15.190,0.745,0.807
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5324,Zr,W,1,5,0.89,0.62,cubic,-1.637,1.196,0.339,12.200,0.000,0.191
5325,Zr,Y,not balanced,not balanced,0.89,0.90,cubic,-2.126,1.422,0.200,15.277,0.000,-4.920
5326,Zr,Yb,not balanced,not balanced,0.89,0.95,orthorhombic,-3.455,0.205,0.000,13.136,4.007,-6.177
5327,Zr,Zn,not balanced,not balanced,0.89,0.74,cubic,-1.630,1.210,0.001,10.804,0.000,-0.762


> Ajustar tipos de datos (Asegurar conformidad)

- Verifico ahora qué features carecen de conformidad

In [11]:
# Re-inspect dtypes after the row removal to see which columns are still object dtype.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5276 entries, 0 to 5328
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   A                           5276 non-null   object 
 1   B                           5276 non-null   object 
 2   Valence A                   5276 non-null   object 
 3   Valence B                   5276 non-null   object 
 4   Radius A [ang]              5276 non-null   float64
 5   Radius B [ang]              5276 non-null   float64
 6   Lowest distortion           5276 non-null   object 
 7   Formation energy [eV/atom]  5276 non-null   object 
 8   Stability [eV/atom]         5276 non-null   object 
 9   Magnetic moment [mu_B]      5276 non-null   object 
 10  Volume per atom [A^3/atom]  5276 non-null   object 
 11  Band gap [eV]               5276 non-null   object 
 12  Vacancy energy [eV/O atom]  5276 non-null   object 
dtypes: float64(2), object(11)
memory 

- Cambio tipos de datos en features incoherentes no problemáticos

In [12]:
# Columns skipped in this pass: the text columns, plus the two columns
# ('Magnetic moment' and 'Vacancy energy') that the notebook converts separately.
not_to_format = ['A','B', 'Valence A', 'Valence B', 'Lowest distortion', 'Magnetic moment [mu_B]', 'Vacancy energy [eV/O atom]']

# Cast every other column to a numeric dtype. errors='coerce' turns any value
# that cannot be parsed into NaN instead of raising.
for col in df_data.columns:
    if col not in not_to_format:
        df_data[col] = pd.to_numeric(df_data[col], errors = 'coerce')

# Show the resulting dtypes.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5276 entries, 0 to 5328
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   A                           5276 non-null   object 
 1   B                           5276 non-null   object 
 2   Valence A                   5276 non-null   object 
 3   Valence B                   5276 non-null   object 
 4   Radius A [ang]              5276 non-null   float64
 5   Radius B [ang]              5276 non-null   float64
 6   Lowest distortion           5276 non-null   object 
 7   Formation energy [eV/atom]  5276 non-null   float64
 8   Stability [eV/atom]         5276 non-null   float64
 9   Magnetic moment [mu_B]      5276 non-null   object 
 10  Volume per atom [A^3/atom]  5276 non-null   float64
 11  Band gap [eV]               5276 non-null   float64
 12  Vacancy energy [eV/O atom]  5276 non-null   object 
dtypes: float64(6), object(7)
memory u

- Analizo features problemáticos y decido

In [13]:
# En magnetic moment y vacancy energy, vamos a ver cuantos missing elements hay
print((df_data['Magnetic moment [mu_B]'] == '-').sum())
print((df_data['Vacancy energy [eV/O atom]'] == '-').sum())


960
362


- Cambio tipos de datos en features incoherentes problemáticos (los valores faltantes pasan a NaN; los registros se borran más adelante)

In [14]:
# Convert the two remaining text columns to numeric; errors='coerce' maps
# unparseable entries (such as the '-' placeholder) to NaN.
for problem_col in ('Magnetic moment [mu_B]', 'Vacancy energy [eV/O atom]'):
    df_data[problem_col] = pd.to_numeric(df_data[problem_col], errors='coerce')

# Display the frame after the conversion.
df_data

Unnamed: 0,A,B,Valence A,Valence B,Radius A [ang],Radius B [ang],Lowest distortion,Formation energy [eV/atom],Stability [eV/atom],Magnetic moment [mu_B],Volume per atom [A^3/atom],Band gap [eV],Vacancy energy [eV/O atom]
0,Ac,Ac,element not in BV,element not in BV,1.12,1.12,cubic,-2.732,0.848,0.000,20.836,0.332,3.150
1,Ac,Ag,element not in BV,element not in BV,1.12,0.95,orthorhombic,-1.957,-0.055,0.000,14.485,0.000,0.817
2,Ac,Al,element not in BV,element not in BV,1.12,0.54,cubic,-3.532,-0.110,0.000,11.487,4.307,6.695
3,Ac,As,element not in BV,element not in BV,1.12,0.52,orthorhombic,-2.398,0.224,0.000,14.355,0.000,3.634
4,Ac,Au,element not in BV,element not in BV,1.12,0.93,orthorhombic,-2.006,-0.056,0.000,15.190,0.745,0.807
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5324,Zr,W,1,5,0.89,0.62,cubic,-1.637,1.196,0.339,12.200,0.000,0.191
5325,Zr,Y,not balanced,not balanced,0.89,0.90,cubic,-2.126,1.422,0.200,15.277,0.000,-4.920
5326,Zr,Yb,not balanced,not balanced,0.89,0.95,orthorhombic,-3.455,0.205,0.000,13.136,4.007,-6.177
5327,Zr,Zn,not balanced,not balanced,0.89,0.74,cubic,-1.630,1.210,0.001,10.804,0.000,-0.762


> Eliminar nulos / NaNs

- Compruebo Nulos

In [15]:
# Number of missing (NaN) values per column.
df_data.isnull().sum()

A                               0
B                               0
Valence A                       0
Valence B                       0
Radius A [ang]                  0
Radius B [ang]                  0
Lowest distortion               0
Formation energy [eV/atom]      0
Stability [eV/atom]             0
Magnetic moment [mu_B]        960
Volume per atom [A^3/atom]      0
Band gap [eV]                   0
Vacancy energy [eV/O atom]    362
dtype: int64

- Elimino Nulos

In [16]:
# Drop every row that has a NaN in any column. The name is rebound instead of
# using inplace=True so the frame object shown by earlier cells is not mutated.
df_data = df_data.dropna()

- Compruebo y veo con cuántos registros quedé

In [17]:
# Naive subtraction gives 5276 - 960 - 362 = 3954 rows, but 3967 remain:
# 13 rows had NaN in both columns and were therefore only removed once.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3967 entries, 0 to 5328
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   A                           3967 non-null   object 
 1   B                           3967 non-null   object 
 2   Valence A                   3967 non-null   object 
 3   Valence B                   3967 non-null   object 
 4   Radius A [ang]              3967 non-null   float64
 5   Radius B [ang]              3967 non-null   float64
 6   Lowest distortion           3967 non-null   object 
 7   Formation energy [eV/atom]  3967 non-null   float64
 8   Stability [eV/atom]         3967 non-null   float64
 9   Magnetic moment [mu_B]      3967 non-null   float64
 10  Volume per atom [A^3/atom]  3967 non-null   float64
 11  Band gap [eV]               3967 non-null   float64
 12  Vacancy energy [eV/O atom]  3967 non-null   float64
dtypes: float64(8), object(5)
memory u

> Eliminar registros duplicados

- Eliminar duplicados

In [18]:
# Remove fully duplicated rows (first occurrence is kept by default). The name is
# rebound instead of using inplace=True so previously displayed frames are not mutated.
df_data = df_data.drop_duplicates()

- Comprobar si algún registro fue eliminado

In [19]:
# Compare the entry count with the previous info() output to see whether
# any duplicate rows were removed.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3967 entries, 0 to 5328
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   A                           3967 non-null   object 
 1   B                           3967 non-null   object 
 2   Valence A                   3967 non-null   object 
 3   Valence B                   3967 non-null   object 
 4   Radius A [ang]              3967 non-null   float64
 5   Radius B [ang]              3967 non-null   float64
 6   Lowest distortion           3967 non-null   object 
 7   Formation energy [eV/atom]  3967 non-null   float64
 8   Stability [eV/atom]         3967 non-null   float64
 9   Magnetic moment [mu_B]      3967 non-null   float64
 10  Volume per atom [A^3/atom]  3967 non-null   float64
 11  Band gap [eV]               3967 non-null   float64
 12  Vacancy energy [eV/O atom]  3967 non-null   float64
dtypes: float64(8), object(5)
memory u

> Normalizar variables categóricas

- Comprobar si ciertas categorías de cada variable categórica están normalizadas

In [20]:
# Distinct values of 'Valence A', to spot inconsistently spelled categories.
df_data['Valence A'].unique()

array(['element not in BV', 'not balanced', '1', '3', '2', '4', '5'],
      dtype=object)

In [21]:
# Distinct values of 'Valence B', to spot inconsistently spelled categories.
df_data['Valence B'].unique()

array(['element not in BV', 'not balanced', '5', '3', '4', '2', '1'],
      dtype=object)

In [22]:
# Distinct values of column 'A', to spot inconsistently spelled categories.
df_data['A'].unique()    

array(['Ac', 'Ag', 'Al', 'As', 'Au', 'B', 'Ba', 'Be', 'Bi', 'Ca', 'Cd',
       'Ce', 'Co', 'Cr', 'Cs', 'Cu', 'Dy', 'Er', 'Eu', 'Fe', 'Ga', 'Gd',
       'Ge', 'Hf', 'Hg', 'Ho', 'In', 'Ir', 'K', 'La', 'Li', 'Lu', 'Mg',
       'Mn', 'Mo', 'Na', 'Nb', 'Nd', 'Ni', 'Np', 'Os', 'Pa', 'Pb', 'Pd',
       'Pm', 'Pr', 'Pt', 'Pu', 'Rb', 'Re', 'Rh', 'Ru', 'Sb', 'Sc', 'Si',
       'Sm', 'Sn', 'Sr', 'Ta', 'Tb', 'Tc', 'Te', 'Th', 'Ti', 'Tl', 'Tm',
       'U', 'V', 'W', 'Y', 'Yb', 'Zn', 'Zr'], dtype=object)

In [23]:
# Distinct values of column 'B', to spot inconsistently spelled categories.
df_data['B'].unique()

array(['Ac', 'Ag', 'Al', 'As', 'Au', 'B', 'Ba', 'Be', 'Bi', 'Ca', 'Cd',
       'Ce', 'Co', 'Cr', 'Cs', 'Cu', 'Dy', 'Er', 'Eu', 'Fe', 'Ga', 'Gd',
       'Ge', 'Hf', 'Hg', 'Ho', 'In', 'Ir', 'K', 'La', 'Li', 'Lu', 'Mn',
       'Mo', 'Nb', 'Nd', 'Ni', 'Np', 'Os', 'Pa', 'Pb', 'Pd', 'Pm', 'Pr',
       'Pt', 'Pu', 'Rb', 'Re', 'Rh', 'Ru', 'Sb', 'Sc', 'Si', 'Sm', 'Sn',
       'Sr', 'Ta', 'Tb', 'Tc', 'Te', 'Th', 'Ti', 'Tl', 'Tm', 'U', 'V',
       'W', 'Y', 'Yb', 'Zn', 'Zr', 'Mg', 'Na'], dtype=object)

# Exportar dataset clean
___

In [24]:
# Write the cleaned frame to disk. to_csv writes the index by default, so the file's
# first column holds the original (now non-contiguous) row labels under an empty header.
# NOTE(review): pass index=False if downstream readers do not need those labels — confirm.
df_data.to_csv("dataset_clean.csv", sep=',')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=57d44cd8-e30a-4be5-b9fd-7812e0bccc78' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>