In [3]:
!pip install sentence_transformers

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [4]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
# Load a lightweight SBERT model; the notebook uses it to embed the text columns.
MY_embeddings = SentenceTransformer(
    'paraphrase-MiniLM-L6-v2',
)

  from tqdm.autonotebook import tqdm, trange


# 这些指标将被用作预测目标
 - energy_grav: 质量比能量（重量能量密度），通常单位为Wh/kg。
 - energy_vol: 体积比能量，通常单位为Wh/L。
 - stability_charge: 充电时的稳定性指标。
 - stability_discharge: 放电时的稳定性指标。

In [5]:
# Load the battery materials table from CSV and preview its first four rows.
# NOTE(review): the rendered preview contains an "Unnamed: 0" column, presumably
# an index that was saved with the file; consider index_col=0 if it is unused.
df = pd.read_csv(filepath_or_buffer="battery_material_li.csv")
df.head(n=4)

Unnamed: 0,battery_id,battery_formula,working_ion,max_voltage_step,elements,nelements,max_delta_volume,average_voltage,capacity_grav,capacity_vol,energy_grav,energy_vol,stability_charge,stability_discharge,material_ids
0,mp-776296_Li,Li0-1MnPO4,Li,0.0,"[Element Mn, Element P, Element O]",3,0.106409,3.517563,170.872882,583.222092,601.05617,2051.52059,0.0,0.037746,"[MPID(mp-777460), MPID(mp-776296)]"
1,mp-26607_Li,Li0-1MnPO4,Li,0.257336,"[Element Mn, Element P, Element O]",3,0.069917,4.307594,170.872882,566.742462,736.050942,2441.296226,0.130776,0.036979,"[MPID(mp-697761), MPID(mp-755395), MPID(mp-266..."
2,mp-754310_Li,Li0-1Fe5(OF2)4,Li,0.0,"[Element Fe, Element O, Element F]",3,0.041918,4.702296,53.373369,218.995116,250.977395,1029.779927,0.075075,0.050619,"[MPID(mp-754310), MPID(mp-1176720)]"
3,mp-771388_Li,Li0-2MnP2O7,Li,0.447935,"[Element Mn, Element P, Element O]",3,0.165571,3.965911,220.803338,615.484551,875.686392,2440.95696,0.006474,0.029376,"[MPID(mp-771388), MPID(mp-776412), MPID(mp-775..."


In [16]:
# Encode text data as numeric arrays with a pretrained text-embedding model.
# Three possible directions for improvement:
#   1. use a quantized model to speed up encoding;
#   2. use embeddings trained on SMILES, which should perform better;
#   3. use a different encoding; one-hot encoding would work directly here.
# The PCA reduction could also be swapped for something like pooling.
def embedding_pca(iter_x, pca_new_dim=16, random_state=42):
    """Embed texts with the module-level SBERT model and reduce them with PCA.

    Args:
        iter_x: Texts to encode; passed unchanged to ``MY_embeddings.encode``.
        pca_new_dim: Number of principal components to keep (default 16).
        random_state: Seed forwarded to ``PCA`` so that repeated runs give the
            same projection when a randomized SVD solver is selected. Pass
            ``None`` to restore the previous unseeded behaviour.

    Returns:
        The array produced by ``PCA.fit_transform`` on the embeddings, with
        ``pca_new_dim`` columns.

    Raises:
        ValueError: Raised by scikit-learn when ``pca_new_dim`` is not a valid
            number of components for the encoded data.
    """
    embeddings = MY_embeddings.encode(iter_x)
    # Show the raw embedding shape before the dimensionality reduction.
    print(embeddings.shape)
    # Reduce to `pca_new_dim` dimensions; seeded for reproducible notebooks.
    pca = PCA(n_components=pca_new_dim, random_state=random_state)
    return pca.fit_transform(embeddings)
# Embed each text column independently and reduce it with PCA.
# NOTE(review): every previewed row has working_ion == "Li"; if the column is
# constant its embedding carries no information — verify before keeping it.
battery_formula_embeddings, elements_embeddings, working_ion_embeddings = (
    embedding_pca(df[text_column].values)
    for text_column in ("battery_formula", "elements", "working_ion")
)


# Input part 1: the text embeddings. These can be fed to the model directly,
# without a separate regression analysis.
battery_elements_working_ion_embeddings = np.concatenate(
    (battery_formula_embeddings, elements_embeddings, working_ion_embeddings),
    axis=1,
)

# Input part 2: numeric descriptors. These should go through some
# significance / correlation analysis first.
other_X = df.loc[
    :,
    [
        "nelements",
        "max_delta_volume",
        "average_voltage",
        "capacity_grav",
        "capacity_vol",
        "max_voltage_step",
    ],
].values

(2440, 384)
(2440, 384)
(2440, 384)


In [17]:
from sklearn.metrics import r2_score

from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE

def _report_fit(rmse_label, alpha_label, r2_label, model, y_true, y_pred):
    """Print the test RMSE, the alpha chosen by cross-validation, and the test R²."""
    print(rmse_label, np.sqrt(mean_squared_error(y_true, y_pred)))
    print(alpha_label, model.alpha_)
    print(r2_label, r2_score(y_true, y_pred))


# Split the data into training and test sets.
# NOTE(review): the embedding features were PCA-reduced on all rows before this
# split, so the test rows already influenced the projection.
y = df['energy_vol']
X_train, X_test, y_train, y_test = train_test_split(
    battery_elements_working_ion_embeddings,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features; the scaler is fitted on the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit ridge regression, selecting the best alpha by cross-validation.
ridge_cv = RidgeCV(alphas=np.logspace(-6, 6, 13)).fit(X_train_scaled, y_train)
y_pred_ridge = ridge_cv.predict(X_test_scaled)
_report_fit(
    "Ridge Regression RMSE:",
    "Best alpha for Ridge:",
    "Ridge Regression R² Score:",
    ridge_cv,
    y_test,
    y_pred_ridge,
)

# Fit LASSO regression, selecting the best alpha by 5-fold cross-validation.
lasso_cv = LassoCV(alphas=None, cv=5, max_iter=10000).fit(X_train_scaled, y_train)
y_pred_lasso = lasso_cv.predict(X_test_scaled)
_report_fit(
    "LASSO Regression RMSE:",
    "Best alpha for LASSO:",
    "LASSO Regression R² Score:",
    lasso_cv,
    y_test,
    y_pred_lasso,
)

# Print the fitted coefficients of both models.
print("Ridge Coefficients:", ridge_cv.coef_)
print("LASSO Coefficients:", lasso_cv.coef_)

Ridge Regression RMSE: 1056.1699579297908
Best alpha for Ridge: 100.0
Ridge Regression R² Score: 0.19690601544916497
LASSO Regression RMSE: 1058.4643729611912
Best alpha for LASSO: 10.079960428604666
LASSO Regression R² Score: 0.19341295573062955
Ridge Coefficients: [  35.02178305   14.7332948  -143.02798895  -34.96373329   72.15891293
  -81.51769426   17.75258481  -44.30563133 -122.14597953  -10.49648613
 -282.19022277 -102.80816832  -24.85890358   29.09939368  -26.12606332
  -41.49402826   86.33528853   19.46110219 -138.73699721   87.98650774
   14.0245716   286.06631048  -37.53667388  -65.8913565  -116.97377326
 -125.99632291  110.97277977   17.58062091   16.05753245   78.45038797
 -106.91340305  114.93276949  -10.79488144   18.55833079    1.18168161
    3.83935728   -6.1055232    -3.38986935    9.2002043    11.86248269
  -37.5639492   -23.08081683   -5.94648234   13.20128617    2.54045736
   -3.28639341   11.93495103  -30.36032082]
LASSO Coefficients: [   6.88153     24.960024  -12