# Materials Preprocessing & Featurization Basics (Assignment)



## **Step 0 : Data retrieval and filtering**





In [None]:
# Install the packages this notebook needs (matminer and its dependencies);
# matplotlib is pinned to version 3.8.0.
!pip install --upgrade matplotlib==3.8.0
!pip install --upgrade pyyaml six matminer[citrine] citrination-client pymatgen

# Optional: uncomment the two lines below to kill the current process so the
# environment restarts and the code can be re-run with the newly installed versions.
#import os
#os.kill(os.getpid(), 9)

# Silence all warnings so the notebook output stays readable.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

# Load the required libraries
import numpy as np # large multi-dimensional arrays/matrices plus high-level math functions that operate on them
import pandas as pd # data manipulation and analysis: numerical tables and time series
import matplotlib.pyplot as plt # plotting library
import seaborn as sns # statistical data visualization built on top of matplotlib

# scikit-learn is a machine learning library (classification, regression, clustering)
# designed to interoperate with NumPy and SciPy.
# NOTE(review): fetch_california_housing, train_test_split and StandardScaler are not
# used in the visible cells of this notebook.

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression

In [None]:
# All matminer datasets can be loaded with load_dataset() and the dataset name.
# The result is stored in `df`, which the following cells featurize step by step
# (they read its "formula" and, later, "structure" columns).
from matminer.datasets import load_dataset

df = load_dataset("dielectric_constant")

In [None]:
# get_all_dataset_info(<dataset name>) from matminer returns more detailed
# information about a dataset; print it to inspect the "dielectric_constant" data
# before featurizing.
from matminer.datasets import get_all_dataset_info

print(get_all_dataset_info("dielectric_constant"))

## **Step 1: Use the conversion featurizers in matminer to turn a string formula into a pymatgen Composition**

In [None]:
from matminer.featurizers.conversions import StrToComposition

In [None]:
# Add composition-based features
# A major class of featurizers available in matminer uses the chemical composition to featurize the input data.
# Let's add some composition-based features to our DataFrame.

# First step: use the conversion featurizers in matminer to turn a string composition
# (the "formula" column of df) into a pymatgen Composition.
# TODO(assignment): replace XXXXX with the conversion featurizer instance
# (StrToComposition is imported in the previous cell).
df1 = XXXXX.featurize_dataframe(df, "formula")
# Preview the first rows of the result.
df1.head()

## **Step 2: Use one of the featurizers in matminer to add a suite of descriptors to the DataFrame**

In [None]:
from matminer.featurizers.composition import ElementProperty

# Add the Magpie feature set, computed from the "composition" column of df1.
# TODO(assignment): replace the XXXXX placeholders with the featurizer class
# (ElementProperty is imported above), the preset name, and the method that
# featurizes a whole DataFrame.
ep_feat = XXXXX.from_preset("XXXXXXXXX")
df2 = ep_feat.XXXXX(df1, col_id="composition")
# Preview the first rows of the result.
df2.head()

## **Step 3: Add more composition-based featurizers**

In [None]:
from matminer.featurizers.conversions import CompositionToOxidComposition
from matminer.featurizers.composition import OxidationStates

In [None]:
# Add more composition-based features
# There are many more composition-based featurizers apart from ElementProperty available in matminer.featurizers.composition.
# Some of them require knowing the oxidation state of the various elements in the Composition,
# so this cell first derives a new column from "composition"; the next cell reads a column
# named "composition_oxid".
# NOTE(review): the original comment named the ElectronegativityDiff featurizer, but this
# notebook imports and applies OxidationStates instead.
# TODO(assignment): replace the XXXXX placeholders, presumably with CompositionToOxidComposition
# (imported in the previous cell) and its DataFrame featurization method.
df3 = XXXXX().XXXXX(df2, "composition")
# Preview the first rows of the result.
df3.head()

In [None]:
# Add statistical features of oxidation states, computed from the
# "composition_oxid" column of df3.
# TODO(assignment): replace the XXXXX placeholders with the featurizer class
# (OxidationStates is imported below) and the method that featurizes a DataFrame.
from matminer.featurizers.composition import OxidationStates
os_feat = XXXXX()
df4 = os_feat.XXXXX(df3, "composition_oxid")
# Preview the first rows of the result.
df4.head()

## **Step 3b: Add structure-based featurizers**

In [None]:
from matminer.featurizers.structure import DensityFeatures

# Add structure-based features, computed from the "structure" column of df4.
# TODO(assignment): replace the placeholders with the featurizer class
# (DensityFeatures is imported above) and the method that featurizes a DataFrame.
df_feat = XXXXXX()
df5 = df_feat.XXXXX(df4, "structure")
# Preview the first rows of the result.
df5.head()

## **Step 4: Select numeric data and perform preprocessing**

In [None]:
import numpy as np

# Keep only the numeric columns: a correlation coefficient is only defined for numbers.
numeric_df = df5.select_dtypes(include=[np.number])
# Keep only columns with at least two distinct non-NaN values. Constant columns
# (one unique value) and all-NaN columns (zero unique values) have no variance,
# so their correlation is undefined and would show up as NaN in the matrix.
# The previous test (`nunique() != 1`) let all-NaN columns through.
numeric_df = numeric_df.loc[:, numeric_df.nunique() > 1]

# Plot the correlation matrix of the remaining numeric columns as a heatmap.
# The colour scale is fixed to [-1, 1] so that zero correlation sits at the
# midpoint of the diverging "coolwarm" colormap.
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(numeric_df.corr(), annot=False, cmap="coolwarm", vmin=-1, vmax=1, ax=ax)
plt.show()

In [None]:
def identify_columns(x_new, nrows=10):
    """Find, for each column of ``x_new``, the closest-valued reference column.

    The reference table is the module-level object written as ``XXXXX`` below.
    TODO(assignment): replace the placeholder; it must expose ``.columns`` and
    ``.values``, and is presumably the feature DataFrame that ``x_new`` was
    derived from.

    Only the first ``nrows`` rows of both arrays are compared. For every column
    of ``x_new`` the reference column with the smallest Euclidean distance over
    those rows is chosen.

    Args:
        x_new: 2-D array indexable as ``x_new[:nrows, None, :]`` (rows x columns).
        nrows: Number of leading rows used for the comparison.

    Returns:
        Array with one reference column label per column of ``x_new``.
    """
    columns = XXXXX.columns
    xvalues = XXXXX.values
    # Broadcast to (rows, reference columns, new columns) and take the norm over
    # the row axis, giving one distance per (reference column, new column) pair.
    dist = np.linalg.norm(xvalues[:nrows, :, None] - x_new[:nrows, None, :], axis=0)
    # For each new column, pick the reference column with the smallest distance.
    return columns[np.argmin(dist, axis=0)].values

In [None]:
# Our target property is the 'poly_total' column of the dielectric_constant dataset
# (presumably the total polycrystalline dielectric constant; confirm against the
# dataset info printed earlier). An earlier version of this comment named 'K_VRH',
# which did not match the code.
y_data = numeric_df['poly_total']
# The feature matrix must NOT contain the target itself: otherwise the target is
# perfectly correlated with y_data, always wins a slot in the feature selection,
# and leaks the answer into any model trained on x_data.
x_data = numeric_df.drop(columns=['poly_total'])

In [None]:
# Use SelectKBest to select the top 10 features, scored with f_regression
# against the target y_data.
# TODO(assignment): replace the placeholders with the selector class
# (SelectKBest is imported in the setup cell) and the method that fits the
# selector and returns the reduced feature array.
sel = XXXXXX(f_regression, k=10)
x_new = sel.XXXXXX(x_data, y_data)

# Get the mask of selected features
# TODO(assignment): replace XXXXX with the selector method that returns this mask.
mask = sel.XXXXX()  # Boolean mask of selected features

# Extract the column names of the selected features
# TODO(assignment): index the feature table's column labels with the mask.
selected_columns = XXXXX.XXXXX[mask]

# Print the names of the selected features.
# NOTE(review): the original comment said this uses identify_columns, but the
# code prints selected_columns directly.
print(f"Selected features: {selected_columns}")

In [None]:
# Compute the correlation of each selected feature with the target and
# visualize it as a bar plot.

# Build a table holding the selected feature columns plus the target column.
# TODO(assignment): replace the XXXXX placeholders: the table and column labels
# to select from, the target values, the DataFrame method that returns the
# correlation matrix, and the label to drop.
selected_features_df = XXXXX[XXXXX]
selected_features_df['poly_total'] = XXXXX
correlation_matrix = selected_features_df.XXXXX()
# Keep only the 'poly_total' column of the matrix (feature-vs-target values);
# one entry is dropped from it. Presumably this is the target's own entry;
# confirm when filling in the placeholder.
target_correlation = correlation_matrix['poly_total'].drop('XXXXX')

# Print the correlation values
print("Correlation of selected features with the target:")
print(target_correlation)

# Bar plot: one bar per selected feature, height = correlation with the target.
plt.figure(figsize=(10, 6))
sns.barplot(x=target_correlation.index, y=target_correlation.values, palette="coolwarm")
# Rotate the feature names so long labels do not overlap.
plt.xticks(rotation=45, ha='right')
plt.xlabel('Features')
plt.ylabel('Correlation with Target')
plt.title('Correlation of Selected Features with Target Property')
plt.show()