### Meta-Feature Extraction for Regression

In [23]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.stats import entropy

In [4]:
# Load the car-price regression CSV. The path is a raw string with Windows-style
# backslash separators, relative to the working directory.
df=pd.read_csv(r'datasets\regression\car_price_prediction_.csv')
# Preview the first rows (rendered as the cell output).
df.head()

Unnamed: 0,Car ID,Brand,Year,Engine Size,Fuel Type,Transmission,Mileage,Condition,Price,Model
0,1,Tesla,2016,2.3,Petrol,Manual,114832,New,26613.92,Model X
1,2,BMW,2018,4.4,Electric,Manual,143190,Used,14679.61,5 Series
2,3,Audi,2013,4.5,Electric,Manual,181601,New,44402.61,A4
3,4,Tesla,2011,4.1,Diesel,Automatic,68682,New,86374.33,Model Y
4,5,Ford,2009,2.6,Diesel,Manual,223009,Like New,73577.1,Mustang


In [6]:
# Remove the 'Car ID' column from `df` in place.
df.drop(columns=['Car ID'],axis=1,inplace=True)

In [7]:
# Preview the frame again now that 'Car ID' has been dropped.
df.head()

Unnamed: 0,Brand,Year,Engine Size,Fuel Type,Transmission,Mileage,Condition,Price,Model
0,Tesla,2016,2.3,Petrol,Manual,114832,New,26613.92,Model X
1,BMW,2018,4.4,Electric,Manual,143190,Used,14679.61,5 Series
2,Audi,2013,4.5,Electric,Manual,181601,New,44402.61,A4
3,Tesla,2011,4.1,Diesel,Automatic,68682,New,86374.33,Model Y
4,Ford,2009,2.6,Diesel,Manual,223009,Like New,73577.1,Mustang


In [14]:
# Print the column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Brand         2500 non-null   object 
 1   Year          2500 non-null   int64  
 2   Engine Size   2500 non-null   float64
 3   Fuel Type     2500 non-null   object 
 4   Transmission  2500 non-null   object 
 5   Mileage       2500 non-null   int64  
 6   Condition     2500 non-null   object 
 7   Price         2500 non-null   float64
 8   Model         2500 non-null   object 
dtypes: float64(2), int64(2), object(5)
memory usage: 175.9+ KB


In [None]:
# Names of every column whose dtype is not object ('O').
num_cols=list(df.select_dtypes(exclude=['O']).columns)
# Trailing expression so the notebook displays the list.
num_cols

['Year', 'Engine Size', 'Mileage', 'Price']

In [22]:
# Keep only the columns that are neither object-typed nor categorical.
numeric_df=df.select_dtypes(exclude=['O','category'])

In [20]:
# Overall missing-value percentage: the mean of the per-column null fractions,
# scaled to 0-100.
missing_values_pct = df.isnull().mean().mean() * 100
# Trailing expression so the notebook displays the value.
missing_values_pct

np.float64(0.0)

In [None]:
# Fill gaps with each column's most frequent value, then standardise the
# columns before fitting a full PCA.
column_modes = numeric_df.mode().iloc[0]
scaled = StandardScaler().fit_transform(numeric_df.fillna(column_modes))
pca = PCA().fit(scaled)
explained = np.cumsum(pca.explained_variance_ratio_)

# Number of leading components needed for the cumulative explained variance to
# reach 95 %; fall back to "all of them" if the threshold is never reached.
reached_95 = explained >= 0.95
num_components = np.argmax(reached_95) + 1 if np.any(reached_95) else numeric_df.shape[1]

# Express that count as a fraction of the number of numeric columns.
pca_fraction_95 = num_components / numeric_df.shape[1]


In [24]:
def mean_feature_entropy_auto(numeric_df, bins=10, max_discrete_levels=20):
    """Return the mean Shannon entropy (natural log) over the columns of *numeric_df*.

    Each column is handled after dropping its missing values:

    * columns with at most one distinct value are skipped;
    * integer columns with fewer than ``max_discrete_levels`` distinct values
      are treated as discrete and use their relative value frequencies;
    * every other column is binned into ``bins`` equal-width histogram bins and
      the non-empty bin counts are used.

    Parameters
    ----------
    numeric_df : pandas.DataFrame
        Frame whose columns are scored one by one.
    bins : int, default 10
        Number of histogram bins for columns treated as continuous.
    max_discrete_levels : int, default 20
        Integer columns with strictly fewer distinct values than this are
        treated as discrete.

    Returns
    -------
    float
        Mean of the per-column entropies, or ``np.nan`` when the frame has no
        columns or no column has more than one distinct value.
    """
    if numeric_df.shape[1] == 0:
        return np.nan

    entropies = []
    for col in numeric_df.columns:
        vals = numeric_df[col].dropna()
        n_levels = vals.nunique()
        if n_levels <= 1:
            # A constant (or empty) column carries no information; skip it.
            continue

        # `is_integer_dtype` also recognises pandas nullable integer dtypes
        # (e.g. "Int64"), on which `np.issubdtype` raises TypeError.
        if pd.api.types.is_integer_dtype(vals) and n_levels < max_discrete_levels:
            # Convert to a plain float array so scipy never receives a pandas
            # extension array.
            weights = vals.value_counts(normalize=True).to_numpy(dtype=float)
        else:
            # Cast to float so bool and nullable columns can be binned too.
            counts, _ = np.histogram(vals.to_numpy(dtype=float), bins=bins)
            # `entropy` normalises its input, so raw counts of equal-width bins
            # give the same result as densities.
            weights = counts[counts > 0]
        entropies.append(entropy(weights))

    return np.mean(entropies) if entropies else np.nan

In [None]:
# n_instances,n_features,n_num_features,n_cat_features,missing_values_pct,
# mean_skewness,mean_kurtosis,avg_correlation,max_correlation,mean_corr_with_target,
# max_corr_with_target,pca_fraction_95,var_mean,var_std,mean_feature_entropy,
# feature_to_instance_ratio,best_model


def _pca_fraction_95(numeric_df):
    """Return the share of principal components needed to reach 95 % cumulative variance.

    Missing values are filled with each column's mode and the columns are
    standardised before a full PCA is fitted. Returns ``np.nan`` when there are
    fewer than two columns or when any step of the computation fails.
    """
    n_cols = numeric_df.shape[1]
    if n_cols <= 1:
        return np.nan
    try:
        filled = numeric_df.fillna(numeric_df.mode().iloc[0])
        scaled = StandardScaler().fit_transform(filled)
        explained = np.cumsum(PCA().fit(scaled).explained_variance_ratio_)
        reached = explained >= 0.95
        num_components = np.argmax(reached) + 1 if np.any(reached) else n_cols
        return num_components / n_cols
    except Exception:
        # Deliberate best-effort: a frame the scaler/PCA cannot handle (e.g. no
        # rows) yields NaN for this one feature instead of aborting extraction.
        return np.nan


def meta_features_extract_reg(dataset: "pd.DataFrame | str", target_col: str, dest_path=None):
    """Compute regression meta-features for a dataset and append them to the meta table.

    Parameters
    ----------
    dataset : pandas.DataFrame or path-like
        Either an already loaded frame or anything ``pd.read_csv`` accepts.
    target_col : str
        Name of the regression target column.
    dest_path : path-like, optional
        CSV file the new row is appended to. Defaults to
        ``meta_learning/meta_regression/meta_features_regression.csv``.

    Returns
    -------
    dict
        The meta-feature values that were appended as one row. (The function
        previously returned ``None``.)

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of the dataset.

    Notes
    -----
    Existing rows are read from the same file that is written, so repeated
    calls accumulate. When the default destination does not exist yet, the
    misspelled legacy file ``meta_feautures_regression.csv`` (the file this
    function used to read) is used as the starting table if present; otherwise
    a new table is started.
    """
    from pathlib import Path

    default_dest = Path("meta_learning/meta_regression/meta_features_regression.csv")
    legacy_src = Path("meta_learning/meta_regression/meta_feautures_regression.csv")
    dest_file = default_dest if dest_path is None else Path(dest_path)

    df = dataset if isinstance(dataset, pd.DataFrame) else pd.read_csv(dataset)
    if target_col not in df.columns:
        raise KeyError(f"target column {target_col!r} not found in dataset")

    n_instances = df.shape[0]
    n_features = df.shape[1] - 1  # every column except the target

    target = df[target_col]
    numeric_df = pd.DataFrame(
        df.drop(columns=[target_col]).select_dtypes(exclude=['object', 'category'])
    )
    n_num_features = numeric_df.shape[1]
    # Everything that is not numeric is counted as categorical.
    n_cat_features = n_features - n_num_features

    missing_values_pct = df.isnull().mean().mean() * 100

    # Computed unconditionally: it does not depend on numeric columns, and the
    # result dict below always needs it.
    feature_to_instance_ratio = n_features / n_instances if n_instances > 0 else np.nan

    mean_skewness = np.nan
    mean_kurtosis = np.nan
    avg_correlation = np.nan
    max_correlation = np.nan
    mean_corr_with_target = np.nan
    max_corr_with_target = np.nan
    var_mean = np.nan
    var_std = np.nan
    if n_num_features > 0:
        mean_skewness = numeric_df.skew().mean()
        mean_kurtosis = numeric_df.kurtosis().replace([np.inf, -np.inf], np.nan).mean()

        # Absolute pairwise correlations; keep only the strict upper triangle so
        # each feature pair is counted once and the diagonal is ignored.
        corr_matrix = numeric_df.corr().abs()
        pair_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        pairwise = corr_matrix.where(pair_mask).stack()
        avg_correlation = pairwise.mean()
        max_correlation = pairwise.max()

        corrs = numeric_df.corrwith(target).abs()
        mean_corr_with_target = corrs.mean()
        max_corr_with_target = corrs.max()

        # Variance, across columns, of the column means and of the column
        # standard deviations respectively.
        # NOTE(review): the names could also be read as "mean of variances";
        # kept as-is so new rows stay comparable with existing ones.
        var_mean = numeric_df.mean().var()
        var_std = numeric_df.std().var()

    pca_fraction_95 = _pca_fraction_95(numeric_df)

    # The helper requires the frame to score; it returns NaN for zero columns.
    mean_feature_entropy = mean_feature_entropy_auto(numeric_df)

    meta_features = {
        "n_instances": n_instances,
        "n_features": n_features,
        "n_num_features": n_num_features,
        "n_cat_features": n_cat_features,
        "missing_values_pct": missing_values_pct,
        "mean_skewness": mean_skewness,
        "mean_kurtosis": mean_kurtosis,
        "avg_correlation": avg_correlation,
        "max_correlation": max_correlation,
        "mean_corr_with_target": mean_corr_with_target,
        "max_corr_with_target": max_corr_with_target,
        "pca_fraction_95": pca_fraction_95,
        "var_mean": var_mean,
        "var_std": var_std,
        "mean_feature_entropy": mean_feature_entropy,
        "feature_to_instance_ratio": feature_to_instance_ratio
    }
    meta_row = pd.DataFrame([meta_features])

    # Read the existing table from the file we are about to write so that rows
    # accumulate across calls.
    if dest_file.exists():
        existing = pd.read_csv(dest_file)
    elif dest_path is None and legacy_src.exists():
        existing = pd.read_csv(legacy_src)
    else:
        existing = None

    if existing is None:
        combined = meta_row
    else:
        combined = pd.concat([existing, meta_row], ignore_index=True)

    dest_file.parent.mkdir(parents=True, exist_ok=True)
    combined.to_csv(dest_file, index=False)

    return meta_features


    
    


