In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
# Location of the fish dataset CSV. This is an absolute drive path, so it only
# resolves on a machine that has the same D:/AUTO_ML/data layout.
# NOTE: preprocess_data() takes a parameter that is also called `file_path`;
# inside the function that parameter shadows this module-level name.
file_path = 'D:/AUTO_ML/data/fish_data.csv'
def preprocess_data(file_path, contamination=0.05, random_state=42):
    """Load a CSV file and return a cleaned, scaled and one-hot-encoded DataFrame.

    The steps are, in order:
      1. read the CSV at ``file_path``;
      2. drop exact duplicate rows;
      3. drop rows that an IsolationForest fitted on the numeric columns
         labels as outliers;
      4. mean-impute + standardize numeric columns, and most-frequent-impute +
         one-hot-encode ``object`` columns.

    Columns that are neither numeric nor ``object`` dtype are not passed to any
    transformer and therefore do not appear in the result.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to load.
    contamination : float, default=0.05
        Expected share of outliers, forwarded to ``IsolationForest``.
    random_state : int or None, default=42
        Seed forwarded to ``IsolationForest`` so the same rows are dropped on
        every run. Pass ``None`` for an unseeded forest.

    Returns
    -------
    pandas.DataFrame
        One row per retained sample with a fresh 0..n-1 index; numeric columns
        come first, followed by the one-hot columns.

    Raises
    ------
    Exception
        Errors raised by ``pd.read_csv`` or by scikit-learn are not caught here.
    """
    # Load the dataset
    df = pd.read_csv(file_path)

    # Display basic info. DataFrame.info() prints its report itself and returns
    # None, so it is not wrapped in print() (that would emit a stray "None").
    print("Original Data:")
    df.info()

    # Remove duplicates
    df = df.drop_duplicates()

    # Remove outliers using Isolation Forest.
    # The forest is skipped when there is no numeric data to fit on (no numeric
    # columns or no rows), since fitting would fail in that case.
    numeric_view = df.select_dtypes(include=[np.number])
    if not numeric_view.empty:
        detector = IsolationForest(contamination=contamination, random_state=random_state)
        # The pipeline's imputer only runs later, so missing values are filled
        # with column means on a temporary copy just for outlier detection.
        # IsolationForest may reject NaN input depending on the scikit-learn
        # version. `df` itself keeps its NaNs for the real imputer.
        labels = detector.fit_predict(numeric_view.fillna(numeric_view.mean()))
        df = df[labels == 1]

    # Identify numerical and categorical columns
    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

    # Preprocessing pipeline
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # Fill missing values
        ('scaler', StandardScaler())                   # Standardization
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values
        ('onehot', OneHotEncoder(handle_unknown='ignore'))      # One-hot encoding
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ],
        # Always return a dense array so the result can be wrapped in a
        # DataFrame even when the one-hot block dominates the output.
        sparse_threshold=0
    )

    # Fit and transform the data
    X_preprocessed = preprocessor.fit_transform(df)

    # Get feature names after encoding. The encoder is only queried when there
    # are categorical columns; with an empty selection it is never fitted.
    if categorical_cols:
        encoder = preprocessor.named_transformers_['cat']['onehot']
        cat_feature_names = list(encoder.get_feature_names_out(categorical_cols))
    else:
        cat_feature_names = []
    all_feature_names = numerical_cols + cat_feature_names

    # Create a DataFrame with the preprocessed data
    X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=all_feature_names)

    print("\nPreprocessed Data:")
    X_preprocessed_df.info()

    return X_preprocessed_df

# Example usage
# preprocessed_data = preprocess_data('path_to_your_file.csv')


In [4]:
# Load the raw fish dataset straight from disk for a quick visual inspection.
df = pd.read_csv("D:/AUTO_ML/data/fish_data.csv")
# Bare last expression: the notebook renders the DataFrame as the cell output.
df

Unnamed: 0,species,length,weight,w_l_ratio
0,Anabas testudineus,10.66,3.45,0.32
1,Anabas testudineus,6.91,3.27,0.47
2,Anabas testudineus,8.38,3.46,0.41
3,Anabas testudineus,7.57,3.36,0.44
4,Anabas testudineus,10.83,3.38,0.31
...,...,...,...,...
4075,Sillaginopsis panijus,30.56,6.12,0.20
4076,Sillaginopsis panijus,29.66,6.11,0.21
4077,Sillaginopsis panijus,32.81,6.25,0.19
4078,Sillaginopsis panijus,29.78,6.11,0.21


In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest

def preprocess_data(file_path, contamination=0.05, random_state=42):
    """Load a CSV file and return a cleaned, scaled and one-hot-encoded DataFrame.

    The data is read from ``file_path``, de-duplicated, stripped of rows that
    an IsolationForest (fitted on the numeric columns) labels as outliers, and
    then transformed: numeric columns are mean-imputed and standardized,
    ``object`` columns are most-frequent-imputed and one-hot-encoded. Columns
    of any other dtype are not passed to a transformer and are absent from the
    result.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to load.
    contamination : float, default=0.05
        Expected share of outliers, forwarded to ``IsolationForest``.
    random_state : int or None, default=42
        Seed forwarded to ``IsolationForest`` so the same rows are dropped on
        every run. Pass ``None`` for an unseeded forest.

    Returns
    -------
    pandas.DataFrame or None
        The preprocessed data (numeric columns first, then one-hot columns,
        fresh 0..n-1 index), or ``None`` if loading or preprocessing failed;
        in that case the error message is printed.
    """
    # Load the dataset from the path the caller supplied.
    try:
        df = pd.read_csv(file_path)
        print("Original Data:")
        print(df.head())  # Display the first few rows
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

    # Remove duplicates
    df = df.drop_duplicates()

    # Outlier removal using Isolation Forest.
    # Skipped when there is no numeric data to fit on (no numeric columns or
    # no rows), since fitting would fail in that case.
    numeric_view = df.select_dtypes(include=[np.number])
    if not numeric_view.empty:
        try:
            detector = IsolationForest(contamination=contamination, random_state=random_state)
            # The pipeline's imputer only runs later, so missing values are
            # filled with column means on a temporary copy for detection only.
            # IsolationForest may reject NaN input depending on the
            # scikit-learn version.
            labels = detector.fit_predict(numeric_view.fillna(numeric_view.mean()))
        except Exception as e:
            # Same reporting convention as the other failure paths below.
            print(f"Error during preprocessing: {e}")
            return None
        df = df[labels == 1]

    # Identify numerical and categorical columns
    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

    # Check if there are any categorical columns
    if not categorical_cols:
        print("No categorical columns found.")
    else:
        print("Categorical columns:", categorical_cols)

    # Preprocessing pipeline
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # Fill missing values
        ('scaler', StandardScaler())                   # Standardization
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values
        ('onehot', OneHotEncoder(handle_unknown='ignore'))      # One-hot encoding
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ],
        # Always return a dense array so the result can be wrapped in a
        # DataFrame even when the one-hot block dominates the output.
        sparse_threshold=0
    )

    # Fit and transform the data
    try:
        X_preprocessed = preprocessor.fit_transform(df)

        # The encoder is only queried when there are categorical columns;
        # with an empty column selection it is never fitted.
        if categorical_cols:
            encoder = preprocessor.named_transformers_['cat']['onehot']
            cat_feature_names = list(encoder.get_feature_names_out(categorical_cols))
        else:
            cat_feature_names = []
        all_feature_names = numerical_cols + cat_feature_names

        # Create a DataFrame with the preprocessed data
        X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=all_feature_names)

        print("\nPreprocessed Data:")
        print(X_preprocessed_df.head())  # Display the first few rows of preprocessed data
    except Exception as e:
        print(f"Error during preprocessing: {e}")
        return None

    return X_preprocessed_df

# Example usage
# preprocessed_data = preprocess_data('path_to_your_file.csv')


NameError: name 'X_preprocessed_df' is not defined

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest

def preprocess_data(file_path, contamination=0.05, random_state=42):
    """Load a CSV file, report progress, and return the preprocessed DataFrame.

    The data is read from ``file_path``, de-duplicated, stripped of rows that
    an IsolationForest (fitted on the numeric columns) labels as outliers, and
    then transformed: numeric columns are mean-imputed and standardized,
    ``object`` columns are most-frequent-imputed and one-hot-encoded. Columns
    of any other dtype are not passed to a transformer and are absent from the
    result. Shapes and detected column lists are printed along the way.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to load.
    contamination : float, default=0.05
        Expected share of outliers, forwarded to ``IsolationForest``.
    random_state : int or None, default=42
        Seed forwarded to ``IsolationForest`` so the same rows are dropped on
        every run. Pass ``None`` for an unseeded forest.

    Returns
    -------
    pandas.DataFrame or None
        The preprocessed data (numeric columns first, then one-hot columns,
        fresh 0..n-1 index), or ``None`` if loading or preprocessing failed;
        in that case the error message is printed.
    """
    # Load the dataset from the path the caller supplied.
    try:
        df = pd.read_csv(file_path)
        print("Original Data Loaded Successfully")
        print("Shape of Original Data:", df.shape)
        print("First few rows of the original data:")
        print(df.head())  # Display the first few rows
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

    # Remove duplicates
    df = df.drop_duplicates()

    # Outlier removal using Isolation Forest.
    # Skipped when there is no numeric data to fit on (no numeric columns or
    # no rows), since fitting would fail in that case.
    numeric_view = df.select_dtypes(include=[np.number])
    if numeric_view.empty:
        print("Outlier removal skipped: no numeric data.")
    else:
        try:
            detector = IsolationForest(contamination=contamination, random_state=random_state)
            # The pipeline's imputer only runs later, so missing values are
            # filled with column means on a temporary copy for detection only.
            # IsolationForest may reject NaN input depending on the
            # scikit-learn version.
            labels = detector.fit_predict(numeric_view.fillna(numeric_view.mean()))
        except Exception as e:
            # Same reporting convention as the other failure paths below.
            print(f"Error during preprocessing: {e}")
            return None
        df = df[labels == 1]
        print("Outliers removed. New shape of data:", df.shape)

    # Identify numerical and categorical columns
    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

    # Debugging: Print identified columns
    print("Numerical columns:", numerical_cols)
    print("Categorical columns:", categorical_cols)

    # Preprocessing pipeline
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # Fill missing values
        ('scaler', StandardScaler())                   # Standardization
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values
        ('onehot', OneHotEncoder(handle_unknown='ignore'))      # One-hot encoding
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ],
        # Always return a dense array so the result can be wrapped in a
        # DataFrame even when the one-hot block dominates the output.
        sparse_threshold=0
    )

    # Fit and transform the data
    try:
        X_preprocessed = preprocessor.fit_transform(df)

        # The encoder is only queried when there are categorical columns;
        # with an empty column selection it is never fitted.
        if categorical_cols:
            encoder = preprocessor.named_transformers_['cat']['onehot']
            cat_feature_names = list(encoder.get_feature_names_out(categorical_cols))
        else:
            cat_feature_names = []
        all_feature_names = numerical_cols + cat_feature_names

        # Create a DataFrame with the preprocessed data
        X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=all_feature_names)

        print("\nPreprocessed Data:")
        print(X_preprocessed_df.head())  # Display the first few rows of preprocessed data
        print("Shape of Preprocessed Data:", X_preprocessed_df.shape)
    except Exception as e:
        print(f"Error during preprocessing: {e}")
        return None

    return X_preprocessed_df

# Example usage
# preprocessed_data = preprocess_data('path_to_your_file.csv')
