In [9]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

class OutlierRemover(BaseEstimator, TransformerMixin):
    def __init__(self, columns, method='iqr', threshold=1.5):
        self.columns = columns
        self.method = method
        self.threshold = threshold

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            X_ = X.copy()
        else:
            X_ = pd.DataFrame(X, columns=self.columns)

        for i, col in enumerate(self.columns):
            print(f"Processing column: {col}")
            print(f"Column type: {X_[col].dtype}")
            print(f"Column head: {X_[col].head()}")
            
            if X_[col].dtype == object:
                print(f"Column {col} is of type object. Skipping...")
                continue
            
            if self.method == 'iqr':
                Q1 = X_[col].quantile(0.25)
                Q3 = X_[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - self.threshold * IQR
                upper_bound = Q3 + self.threshold * IQR
                X_.loc[(X_[col] < lower_bound) | (X_[col] > upper_bound), col] = np.nan
            elif self.method == 'zscore':
                z_scores = (X_[col] - X_[col].mean()) / X_[col].std()
                X_.loc[(z_scores < -self.threshold) | (z_scores > self.threshold), col] = np.nan

        return X_.values if isinstance(X, np.ndarray) else X_

df = pd.read_csv('auto-mpg.csv')

numeric_features = ['mpg', 'displacement', 'horsepower', 'weight', 'acceleration', 'cylinders', 'model year', 'origin']
categorical_features = ['car name']

df['horsepower'] = pd.to_numeric(df['horsepower'].replace('?', np.nan), errors='coerce')

df['cylinders'] = df['cylinders'].astype(int)
df['model year'] = df['model year'].astype(int)

print(df.dtypes)

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('outlier_remover', OutlierRemover(columns=numeric_features, method='iqr')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse_output=False, drop='first'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

full_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

df_transformed = full_pipeline.fit_transform(df)

feature_names = (numeric_features +
    full_pipeline.named_steps['preprocessor']
    .named_transformers_['cat']
    .named_steps['onehot']
    .get_feature_names_out(categorical_features).tolist()
)

df_final = pd.DataFrame(df_transformed, columns=feature_names)

print(df_final.head())
print(df_final.shape)

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model year        int64
origin            int64
car name         object
dtype: object
Processing column: mpg
Column type: float64
Column head: 0    18.0
1    15.0
2    18.0
3    16.0
4    17.0
Name: mpg, dtype: float64
Processing column: displacement
Column type: float64
Column head: 0    307.0
1    350.0
2    318.0
3    304.0
4    302.0
Name: displacement, dtype: float64
Processing column: horsepower
Column type: float64
Column head: 0    130.0
1    165.0
2    150.0
3    150.0
4    140.0
Name: horsepower, dtype: float64
Processing column: weight
Column type: float64
Column head: 0    3504.0
1    3693.0
2    3436.0
3    3433.0
4    3449.0
Name: weight, dtype: float64
Processing column: acceleration
Column type: float64
Column head: 0    12.0
1    11.5
2    11.0
3    12.0
4    10.5
Name: acceleration, dtype: float64
Processing column: cylinders


In [12]:
df_final.to_csv('auto-mpg-processed.csv', index=False)
df_final.head(-15)

one_hot_columns = full_pipeline.named_steps['preprocessor'].transformers_[1][1].named_steps['onehot'].get_feature_names_out(categorical_features)
print(one_hot_columns)

for col in one_hot_columns:
    print(df_final[col].value_counts())

    


['car name_amc ambassador dpl' 'car name_amc ambassador sst'
 'car name_amc concord' 'car name_amc concord d/l'
 'car name_amc concord dl' 'car name_amc concord dl 6'
 'car name_amc gremlin' 'car name_amc hornet'
 'car name_amc hornet sportabout (sw)' 'car name_amc matador'
 'car name_amc matador (sw)' 'car name_amc pacer' 'car name_amc pacer d/l'
 'car name_amc rebel sst' 'car name_amc spirit dl' 'car name_audi 100 ls'
 'car name_audi 100ls' 'car name_audi 4000' 'car name_audi 5000'
 'car name_audi 5000s (diesel)' 'car name_audi fox' 'car name_bmw 2002'
 'car name_bmw 320i' 'car name_buick century' 'car name_buick century 350'
 'car name_buick century limited' 'car name_buick century luxus (sw)'
 'car name_buick century special' 'car name_buick electra 225 custom'
 'car name_buick estate wagon (sw)' 'car name_buick lesabre custom'
 'car name_buick opel isuzu deluxe'
 'car name_buick regal sport coupe (turbo)' 'car name_buick skyhawk'
 'car name_buick skylark' 'car name_buick skylark 3

In [11]:
df_final[col].value_counts()

car name_vw rabbit custom
0.0    397
1.0      1
Name: count, dtype: int64