<a href="https://colab.research.google.com/github/RisaRules6762/protein-RMSD-project/blob/feature-engineering/Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy.stats import skew

# Load dataset (pandas infers the zip compression from the file extension)
df = pd.read_csv("proteinstructure.zip")

# Preview columns
print("Original Columns:", df.columns.tolist())

# Capture the raw feature columns now, before any derived column whose name
# also starts with 'F' (F1_F2_ratio, F4_minus_F3, F7_bin, ...) is added.
# Selecting them later would standardize the F7 bin labels and list the
# derived ratio/difference columns twice.
numerical_cols = [col for col in df.columns if col.startswith('F')]

# 1. Create New Features
# Ratio of F1 to F2 (exposed area ratios); the epsilon avoids division by zero
df['F1_F2_ratio'] = df['F1'] / (df['F2'] + 1e-6)

# Total exposed surface area (sum of F1, F2, F5)
df['exposed_sum'] = df['F1'] + df['F2'] + df['F5']

# Mean of F6, F7, F8 (spatial constraints)
df['spatial_mean'] = df[['F6', 'F7', 'F8']].mean(axis=1)

# Difference between F4 and F3 (non-polar vs polar area)
df['F4_minus_F3'] = df['F4'] - df['F3']

interaction_cols = ['F1_F2_ratio', 'exposed_sum', 'spatial_mean', 'F4_minus_F3']

# 2. Encode Categorical Features
# This dataset has no categorical features by default, but we'll simulate one.
# NOTE: RMSD_quality (and RMSD_bin below) are derived from the target RMSD,
# so they must not be fed to a model that predicts RMSD.
df['RMSD_quality'] = pd.cut(
    df['RMSD'],
    bins=[-np.inf, 2, 5, np.inf],
    labels=['High', 'Medium', 'Low'],
)

# pd.cut returns an ordered categorical, so its codes follow the label order
# (High=0, Medium=1, Low=2). LabelEncoder would sort the classes
# alphabetically (High=0, Low=1, Medium=2) and lose the ordinal meaning.
# Rows with a missing RMSD get the code -1.
df['RMSD_quality_encoded'] = df['RMSD_quality'].cat.codes.astype(int)

# 3. Binning or Discretization
# Bin F7 into quartiles
df['F7_bin'] = pd.qcut(df['F7'], q=4, labels=False)

# Bin RMSD into equal-width bins
df['RMSD_bin'] = pd.cut(df['RMSD'], bins=4, labels=False)

# 4. Transform Skewed Data
# This runs BEFORE scaling: standardized columns contain values <= -1, for
# which log1p yields NaN/-inf and RuntimeWarnings. Skewness is unchanged by
# standardization, so the same columns are selected either way.
all_numeric_cols = numerical_cols + interaction_cols
skewed_feats = df[all_numeric_cols].apply(lambda x: skew(x.dropna()))
skewed_cols = skewed_feats[skewed_feats.abs() > 1].index

# NOTE(review): log1p compresses right tails only; columns selected because
# of strong negative skew are still transformed here to keep the same set of
# '_log' columns as before -- confirm whether that is wanted.
log_cols = []
for col in skewed_cols:
    # Shift only columns that contain negative values so that the argument of
    # log1p is always >= 0; non-negative columns get a plain log1p.
    offset = min(df[col].min(), 0)
    log_name = col + '_log'
    df[log_name] = np.log1p(df[col] - offset)
    log_cols.append(log_name)

# 5. Scaling or Normalization
# Fit ONE scaler on every column to be standardized (raw features, engineered
# features and their log variants; target RMSD excluded) so that `scaler` can
# later transform new data with exactly the same columns.
scale_cols = all_numeric_cols + log_cols
scaler = StandardScaler()
df[scale_cols] = scaler.fit_transform(df[scale_cols])

# 6. Create Interaction Features
# Multiply features with strong individual signal (computed from the
# standardized columns, so the products are centered)
df['F3_times_F4'] = df['F3'] * df['F4']
df['F8_times_F9'] = df['F8'] * df['F9']
df['F2_plus_F9'] = df['F2'] + df['F9']

# Final Preview
print("\nEngineered Dataset Shape:", df.shape)
print("\nSample Rows:")
print(df.head())

# df.to_csv("CASP_feature_engineered.csv", index=False)



Original Columns: ['RMSD', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9']

Engineered Dataset Shape: (45730, 32)

Sample Rows:
     RMSD        F1        F2        F3        F4        F5        F6  \
0  17.284  0.908481  0.879584  0.240885  1.058750  0.894440  0.996035   
1   6.021 -0.906740 -0.952127 -0.640245 -0.903989 -1.001457 -0.834813   
2   9.275 -0.528725 -0.881704 -1.255652 -0.653210 -0.518857 -0.912117   
3  15.851 -0.356576 -0.443293 -0.338425 -0.643398 -0.279821 -0.517141   
4   7.962 -0.594061 -0.874424 -1.106651 -0.921619 -0.615710 -0.730225   

         F7        F8        F9  ...    F6_log    F7_log    F8_log  \
0  0.149539  0.566885 -1.253153  ...  0.691163  0.139361  0.449090   
1 -0.331492 -0.548301  0.672800  ... -1.800680 -0.402706 -0.794739   
2 -0.505989 -0.725315  0.717134  ... -2.431744 -0.705198 -1.292129   
3 -0.371967  0.000441  0.759477  ... -0.728030 -0.465163  0.000441   
4 -0.589569 -0.512898  0.901558  ... -1.310169 -0.890546 -0.719282   

   F1_

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
