In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [2]:

# Read the raw dataset into a DataFrame; the relative path is resolved
# against the notebook's current working directory.
data = pd.read_csv(filepath_or_buffer='Data-1.csv')


In [3]:
# Print a header line, then the column Index on the following line(s).
print("Column Names:", data.columns, sep="\n")

Column Names:
Index(['Ware_house_ID', 'WH_Manager_ID', 'Location_type', 'WH_capacity_size',
       'zone', 'WH_regional_zone', 'num_refill_req_l3m', 'transport_issue_l1y',
       'Competitor_in_mkt', 'retail_shop_num', 'wh_owner_type',
       'distributor_num', 'flood_impacted', 'flood_proof', 'electric_supply',
       'dist_from_hub', 'workers_num', 'wh_est_year',
       'storage_issue_reported_l3m', 'temp_reg_mach',
       'approved_wh_govt_certificate', 'wh_breakdown_l3m', 'govt_check_l3m',
       'product_wg_ton'],
      dtype='object')


In [4]:
# Split the frame into the prediction target (y) and the remaining feature columns (X).
# NOTE(review): 'Ware_house_ID' and 'WH_Manager_ID' look like identifiers. If they
# are stored as strings they will be one-hot encoded into one column per distinct
# value further down - confirm they are really meant to be used as features.
y = data['product_wg_ton']
X = data.drop(columns='product_wg_ton')

In [5]:
# Split the feature columns by dtype so each group gets its own preprocessing.
# 'number' selects every numeric dtype regardless of width, and every remaining
# column is treated as categorical. Listing only int64/float64/object would leave
# a column of any other dtype (e.g. int32, float32, category) in neither group,
# and the ColumnTransformer built from these lists drops unlisted columns by
# default, so such a column would silently vanish from the output.
numerical_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(exclude='number').columns

In [6]:
# Preprocessing for numeric features: fill missing values with the column
# mean, then standardize with StandardScaler.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical features: fill missing values with the most
# frequent value, then one-hot encode. handle_unknown='ignore' makes the
# encoder tolerate categories at transform time that were not seen during fit.
# The encoder is left at its default (sparse) output: the explicit `sparse`
# keyword was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4, so omitting it keeps the same behaviour on both old and new releases.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [7]:
# Route each column group through its own pipeline: the numeric pipeline is
# applied to numerical_cols and the categorical pipeline to categorical_cols.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])


In [8]:
# Fit the preprocessor on the features and transform them in one pass.
features = preprocessor.fit_transform(X)

# Because the one-hot encoder emits sparse output, fit_transform may return a
# SciPy sparse matrix instead of a NumPy array. pd.DataFrame() does not expand
# a sparse matrix into numeric columns, so build the frame through the sparse
# accessor in that case (a NumPy array has no .toarray(), a sparse matrix does).
# Reusing X.index keeps the feature rows aligned with y during the concat even
# if the source frame does not carry a default 0..n-1 index.
if hasattr(features, 'toarray'):
    features_df = pd.DataFrame.sparse.from_spmatrix(features, index=X.index)
else:
    features_df = pd.DataFrame(features, index=X.index)

# Append the target as the last column. Feature columns keep their positional
# labels (0, 1, ...), so the layout of the saved file is unchanged.
processed_data = pd.concat([features_df, y], axis=1)

In [9]:
# Write the preprocessed table to disk, leaving out the row index column.
processed_data.to_csv(path_or_buf='Processed_Data-1.csv', index=False)

# Confirmation message for the reader of the notebook output.
print("Data preprocessing completed and saved as 'Processed_Data-1.csv'")

Data preprocessing completed and saved as 'Processed_Data-1.csv'
