In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import os
# --- PCA feature extraction -------------------------------------------------
# Loads the descriptor table, cleans and standardises it, keeps the smallest
# number of principal components whose cumulative explained variance reaches
# 95 %, and writes the projected data to data/3-feature/pca_features.csv.

# Use Times New Roman for figure text and a plain hyphen for minus signs.
plt.rcParams['font.sans-serif'] = ['Times New Roman']
plt.rcParams['axes.unicode_minus'] = False

# Make sure the output folders exist before anything is written to them;
# savefig/to_csv do not create missing parent directories.
os.makedirs('Plots/3-feature', exist_ok=True)
os.makedirs('data/3-feature', exist_ok=True)

df = pd.read_csv('data/2-descriptor/all_descriptors.csv')

# Everything except the identifier, target and SMILES columns is a descriptor.
X = df.drop(['ID', 'label', 'SMILES'], axis=1)

# Cleaning: drop columns that are entirely NaN, impute the remaining gaps
# with the column median, then remove zero-variance columns.
X = X.dropna(axis=1, how='all')
X = X.fillna(X.median())
constant_columns = X.columns[X.std() == 0]
X = X.drop(columns=constant_columns)

# Standardise so every descriptor contributes on the same scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Full PCA first, only to read off the explained-variance spectrum.
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)

# NOTE: the figure cell further down saves to this same file name, so after a
# full run this image is replaced by the two-panel figure.
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(pca.explained_variance_ratio_) + 1),
         cumulative_variance_ratio, 'bo-')
plt.xlabel('Number of principal components')
plt.ylabel('Cumulative explained variance ratio')
plt.title('PCA Cumulative Explained Variance Ratio')
plt.grid(True)
plt.savefig('Plots/3-feature/pca_variance_ratio.png', dpi=300, bbox_inches='tight')
plt.close()

# argmax on the boolean mask returns the first index at or above the 95 %
# threshold; +1 converts the 0-based index into a component count.
n_components_95 = np.argmax(cumulative_variance_ratio >= 0.95) + 1
print(f"Explain the number of principal components required to account for 95% of the variance: {n_components_95}")

# Refit with exactly that many components; this is the model that gets saved.
pca_final = PCA(n_components=n_components_95)
X_pca_final = pca_final.fit_transform(X_scaled)

# Absolute loadings, transposed so that features are rows and PCs are columns.
feature_importance = pd.DataFrame(
    data=np.abs(pca_final.components_.T),
    index=X.columns,
    columns=[f'PC{i+1}' for i in range(n_components_95)]
)

# For (at most) the first 5 principal components, keep the 10 features with
# the largest absolute loading.
top_features_per_pc = {}
for i in range(min(5, n_components_95)):
    pc_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': np.abs(pca_final.components_[i])
    })
    top_features = pc_importance.nlargest(10, 'importance')
    top_features_per_pc[f'PC{i+1}'] = top_features

# Assemble the output table: projected components plus the bookkeeping columns.
pca_df = pd.DataFrame(X_pca_final, columns=[f'PC{i+1}' for i in range(n_components_95)])
pca_df['ID'] = df['ID']
pca_df['label'] = df['label']
pca_df['SMILES'] = df['SMILES']

# Put ID / SMILES / label first, components after.
cols = ['ID', 'SMILES', 'label'] + [col for col in pca_df.columns if col not in ['ID', 'SMILES', 'label']]
pca_df = pca_df[cols]

pca_df.to_csv('data/3-feature/pca_features.csv', index=False)

pca_df

In [None]:
# Two-panel summary of the retained PCA components:
#   left  - explained variance ratio of each retained component
#   right - cumulative explained variance up to the retained count, with the
#           95 % threshold and the chosen component count marked.
fig, (ax_each, ax_cum) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: per-component explained variance.
retained_ratio = pca_final.explained_variance_ratio_
ax_each.bar(range(1, len(retained_ratio) + 1), retained_ratio)
ax_each.set_xlabel('Principal Components', fontsize=15)
ax_each.set_ylabel('Percentage of Explained Variance', fontsize=15)
ax_each.set_title('Percentage of Explained Variance by each Principal Component', fontsize=15)

# Right panel: cumulative curve restricted to the retained components.
retained_cumulative = cumulative_variance_ratio[:n_components_95]
ax_cum.plot(range(1, len(retained_cumulative) + 1), retained_cumulative, 'bo-')
ax_cum.axhline(y=0.95, color='r', linestyle='--', alpha=0.7, label='95% cut-off threshold')
ax_cum.axvline(x=n_components_95, color='g', linestyle='--', alpha=0.7,
               label=f'PC{n_components_95}')
ax_cum.set_xlabel('Number of Components', fontsize=15)
ax_cum.set_ylabel('Cumulative Explained Variance', fontsize=15)
ax_cum.set_title('Cumulative Explained Variance by Number of Components', fontsize=15)
ax_cum.legend()

fig.tight_layout()
fig.savefig('Plots/3-feature/pca_variance_ratio.png', dpi=600, bbox_inches='tight')
plt.close(fig)

In [None]:
# Persist everything needed to reproduce the PCA transform at prediction time:
# the fitted PCA, the fitted scaler, the input feature order and a JSON summary.
import json

import joblib

os.makedirs('Predict/feature/pca', exist_ok=True)

# Destination files, all inside Predict/feature/pca.
pca_model_path = 'Predict/feature/pca/pca_model.joblib'
scaler_path = 'Predict/feature/pca/scaler.joblib'
feature_names_path = 'Predict/feature/pca/feature_names.txt'
pca_info_path = 'Predict/feature/pca/pca_info.json'

# Fitted estimators.
joblib.dump(pca_final, pca_model_path)
joblib.dump(scaler, scaler_path)

# One descriptor name per line, in the column order the scaler was fitted on.
with open(feature_names_path, 'w') as f:
    f.write('\n'.join(X.columns))

# numpy scalars are not JSON serialisable, so convert to built-in int/float.
pca_info = {
    'n_components': int(n_components_95),
    'explained_variance_ratio': list(map(float, pca_final.explained_variance_ratio_)),
    'cumulative_variance_ratio': list(map(float, cumulative_variance_ratio)),
}

with open(pca_info_path, 'w') as f:
    json.dump(pca_info, f, indent=4)


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns


# --- Univariate feature selection (SelectKBest, ANOVA F-test) ---------------
# Scores every cleaned, standardised descriptor against the label, plots the
# 20 highest F-scores and writes the k best descriptors to
# data/3-feature/kbest_features.csv.
import os  # imported here so this cell does not depend on an earlier cell

# savefig/to_csv do not create missing parent directories.
os.makedirs('Plots/3-feature', exist_ok=True)
os.makedirs('data/3-feature', exist_ok=True)

df = pd.read_csv('data/2-descriptor/all_descriptors.csv')

X = df.drop(['ID', 'label', 'SMILES'], axis=1)
y = df['label']

# Cleaning: drop all-NaN columns, median-impute the rest, drop zero-variance
# columns.
X = X.dropna(axis=1, how='all')
X = X.fillna(X.median())
constant_columns = X.columns[X.std() == 0]
X = X.drop(columns=constant_columns)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

k = 20
selector = SelectKBest(score_func=f_classif, k=k)
X_selected = selector.fit_transform(X_scaled, y)

# Per-feature F-score and p-value. The p-value column must be called 'p':
# the export cell reads scores['p'] (it was previously stored under an empty
# column name, which made that lookup raise KeyError).
scores = pd.DataFrame({
    'feature': X.columns,
    'score': selector.scores_,
    'p': selector.pvalues_
})

scores = scores.sort_values('score', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(data=scores.head(20), x='score', y='feature')
plt.title('Top 20 Features by F-score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('Plots/3-feature/kbest_scores.png', dpi=300, bbox_inches='tight')
plt.close()

selected_features = scores.head(k)['feature'].tolist()

# .copy() gives an independent frame, so adding the bookkeeping columns below
# neither touches X nor triggers SettingWithCopyWarning.
# NOTE(review): values are taken from X (unscaled), not X_scaled - confirm
# that saving unscaled descriptors is intended.
X_selected_df = X[selected_features].copy()
X_selected_df['ID'] = df['ID']
X_selected_df['label'] = df['label']
X_selected_df['SMILES'] = df['SMILES']

# Put ID / SMILES / label first, selected descriptors after.
cols = ['ID', 'SMILES', 'label'] + selected_features
X_selected_df = X_selected_df[cols]

X_selected_df.to_csv('data/3-feature/kbest_features.csv', index=False)


X_selected_df

In [None]:
import os
import joblib
import json


# Persist the fitted SelectKBest selector, its scaler and a JSON description
# of the scoring so the selection can be reapplied at prediction time.
os.makedirs('Predict/feature/Kbest', exist_ok=True)

# Fitted selector.
kbest_model_path = 'Predict/feature/Kbest/kbest_model.joblib'
joblib.dump(selector, kbest_model_path)

# Fitted scaler.
scaler_path = 'Predict/feature/Kbest/scaler.joblib'
joblib.dump(scaler, scaler_path)

# Scores are listed in the (score-descending) order of the `scores` table;
# numpy floats are converted to built-in floats for JSON.
feature_scores = {
    'features': scores['feature'].tolist(),
    'scores': list(map(float, scores['score'])),
    'p_values': list(map(float, scores['p'])),
}

feature_info = {
    'original_features': X.columns.tolist(),
    'selected_features': selected_features,
    'feature_scores': feature_scores,
    'k_value': k,
}

feature_info_path = 'Predict/feature/Kbest/feature_info.json'
with open(feature_info_path, 'w') as f:
    json.dump(feature_info, f, indent=4)


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier  # 改为随机森林
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns


# --- Recursive feature elimination with a random forest ---------------------
# Cleans and standardises the descriptors, runs RFE down to
# n_features_to_select descriptors and writes them to
# data/3-feature/rfe_features.csv.
import os  # imported here so this cell does not depend on an earlier cell

# savefig/to_csv do not create missing parent directories.
os.makedirs('Plots/3-feature', exist_ok=True)
os.makedirs('data/3-feature', exist_ok=True)

df = pd.read_csv('data/2-descriptor/all_descriptors.csv')


X = df.drop(['ID', 'label', 'SMILES'], axis=1)
y = df['label']

# Cleaning: drop all-NaN columns, median-impute the rest, drop zero-variance
# columns.
X = X.dropna(axis=1, how='all')
X = X.fillna(X.median())
constant_columns = X.columns[X.std() == 0]
X = X.drop(columns=constant_columns)


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# step=0.1 is a fraction: each RFE round discards about 10 % of the features
# instead of one at a time.
n_features_to_select = 100
estimator = RandomForestClassifier(n_estimators=100, random_state=42)
selector = RFE(estimator=estimator, n_features_to_select=n_features_to_select, step=0.1)


selector = selector.fit(X_scaled, y)

# One row per descriptor: its elimination rank and whether it was kept.
feature_ranking = pd.DataFrame({
    'feature': X.columns,
    'rank': selector.ranking_,
    'select': selector.support_
})


feature_ranking = feature_ranking.sort_values('rank')

# Figure: names of the first 20 kept descriptors. Every bar has height 1, so
# the chart only lists names; it does not show a magnitude.
plt.figure(figsize=(12, 6))
selected_features = feature_ranking[feature_ranking['select']].head(20)
plt.bar(range(len(selected_features)), [1]*len(selected_features))
plt.xticks(range(len(selected_features)), selected_features['feature'], rotation=45, ha='right')
plt.title('Top 20 Selected Features by RFE')
plt.tight_layout()
plt.savefig('Plots/3-feature/rfe_features.png', dpi=300, bbox_inches='tight')
plt.close()

# Full list of kept descriptors (not just the 20 shown in the figure).
selected_features_all = feature_ranking[feature_ranking['select']]['feature'].tolist()

# .copy() gives an independent frame, so adding the bookkeeping columns below
# neither touches X nor triggers SettingWithCopyWarning.
# NOTE(review): values are taken from X (unscaled), not X_scaled - confirm
# that saving unscaled descriptors is intended.
X_selected_df = X[selected_features_all].copy()
X_selected_df['ID'] = df['ID']
X_selected_df['label'] = df['label']
X_selected_df['SMILES'] = df['SMILES']

# Put ID / SMILES / label first, selected descriptors after.
cols = ['ID', 'SMILES', 'label'] + selected_features_all
X_selected_df = X_selected_df[cols]

X_selected_df.to_csv('data/3-feature/rfe_features.csv', index=False)

X_selected_df

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import os
import joblib
import json

# Persist the fitted RFE selector, its scaler and a JSON description of the
# ranking so the selection can be reapplied at prediction time.
os.makedirs('Predict/feature/RFE', exist_ok=True)

rfe_model_path = 'Predict/feature/RFE/rfe_model.joblib'
joblib.dump(selector, rfe_model_path)

scaler_path = 'Predict/feature/RFE/scaler.joblib'
joblib.dump(scaler, scaler_path)

# Every descriptor RFE kept, read straight from the ranking table.
# `selected_features` from the RFE cell must not be used here: it is the
# 20-row subset built for the figure, so the JSON would list only 20 names
# while 'n_features_selected' reports the full count.
rfe_selected_features = feature_ranking.loc[feature_ranking['select'], 'feature'].tolist()

feature_info = {
    'original_features': X.columns.tolist(),
    'selected_features': rfe_selected_features,
    'feature_ranking': {
        'features': feature_ranking['feature'].tolist(),
        # numpy ints are not JSON serialisable; convert to built-in int.
        'rankings': [int(x) for x in feature_ranking['rank']],
        'is_selected': feature_ranking['select'].tolist()
    },
    'n_features_selected': n_features_to_select
}

feature_info_path = 'Predict/feature/RFE/feature_info.json'
with open(feature_info_path, 'w') as f:
    json.dump(feature_info, f, indent=4)


In [None]:
# --- Baseline without feature selection -------------------------------------
# Keeps every numeric descriptor, imputes missing values, standardises and
# writes the result to data/3-feature/nofs_features.csv.
import os  # imported here so this cell does not depend on an earlier cell

df = pd.read_csv("data/2-descriptor/all_descriptors.csv")

# Bookkeeping columns that are carried through untouched. Only those actually
# present are used, both for dropping here and for rebuilding the output
# below, so a table without e.g. SMILES is handled consistently instead of
# failing later on a missing column.
meta_cols = [col for col in ['ID', 'SMILES', 'label'] if col in df.columns]
features = df.drop(columns=meta_cols)

print("\nColumns with non-numeric data types:")
non_numeric_cols = features.select_dtypes(exclude=['int64', 'float64']).columns
if len(non_numeric_cols) > 0:
    print(non_numeric_cols.tolist())
    print("\nSample values from non-numeric columns:")
    for col in non_numeric_cols:
        print(f"\n{col}:")
        print(features[col].value_counts().head())

    # Keep only int64/float64 columns; everything else is discarded.
    features = features.select_dtypes(include=['int64', 'float64'])
    print(f"\nRemoved {len(non_numeric_cols)} non-numeric columns")
else:
    print("All columns are numeric")

print(f"\nFinal shape: {features.shape}")


print(f"\nHandling missing values...")
print(f"Original shape: {features.shape}")

# Columns that are entirely NaN have no median to impute from, so drop them.
features = features.dropna(axis=1, how='all')
print(f"Shape after dropping all-NaN columns: {features.shape}")

# Median imputation for the remaining gaps.
features = features.fillna(features.median())
print(f"Shape after filling NaN values: {features.shape}")


if features.isna().any().any():
    print("Warning: There are still NaN values in the dataset!")
else:
    print("All NaN values have been handled successfully.")


scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)
features_scaled = pd.DataFrame(features_scaled, columns=features.columns)

# Bookkeeping columns first, scaled descriptors after. A single concat avoids
# inserting hundreds of columns one by one, which fragments the frame.
# features_scaled has a fresh default index, so reset df's as well to keep the
# rows paired by position.
df_processed = pd.concat(
    [df[meta_cols].reset_index(drop=True), features_scaled],
    axis=1,
)

# to_csv does not create missing parent directories.
os.makedirs('data/3-feature', exist_ok=True)
output_path = 'data/3-feature/nofs_features.csv'
df_processed.to_csv(output_path, index=False)
print(f"\nProcessed data saved to: {output_path}")

In [None]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')


# --- SMOTE upsampling of every feature table --------------------------------
# For each CSV in data/3-feature: balance the classes with SMOTE, save the
# resampled table to data/4-upsample/upsampled_<name>.csv and save a
# before/after class-count figure next to it.

# to_csv/savefig do not create missing parent directories.
os.makedirs('data/4-upsample', exist_ok=True)

# os.listdir returns entries in arbitrary order; sort so the processing order
# (and therefore the printed output) is the same on every run.
csv_files = sorted(f for f in os.listdir('data/3-feature') if f.endswith('.csv'))

for csv_file in csv_files:
    df = pd.read_csv(f'data/3-feature/{csv_file}')

    # Identifier columns are not features and cannot be interpolated.
    df = df.drop(['ID', 'SMILES'], axis=1)

    X = df.drop('label', axis=1)
    y = df['label']

    plt.figure(figsize=(10, 4))

    # Left panel: class counts before resampling.
    plt.subplot(1, 2, 1)

    sns.countplot(data=df, x='label', hue='label', palette='Set2', legend=False)
    plt.title(f'Original Distribution\n({csv_file})')

    # Fixed seed so the synthetic samples are reproducible.
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Rebuild one table with the label as the last column.
    df_resampled = pd.DataFrame(
        np.column_stack([X_resampled, y_resampled]),
        columns=X.columns.tolist() + ['label']
    )

    # column_stack puts everything into one array dtype, so restore the
    # integer label.
    df_resampled['label'] = df_resampled['label'].astype(int)

    output_filename = f'data/4-upsample/upsampled_{csv_file}'
    df_resampled.to_csv(output_filename, index=False)

    print(df_resampled['label'].value_counts())

    # Right panel: class counts after resampling.
    plt.subplot(1, 2, 2)
    sns.countplot(data=df_resampled, x='label', hue='label', palette='Set2', legend=False)
    plt.title('SMOTE Upsampled Distribution')

    plt.tight_layout()
    plt.savefig(f'data/4-upsample/upsampled_plot_{csv_file.replace(".csv", ".png")}')
    plt.show()
    plt.close()
