In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
# Step 1: Data Loading
def load_data(filepath):
    df = pd.read_csv(filepath)
    return df

In [3]:
# Step 2: Data Cleaning
def data_cleaning(df):
    # Handling missing values
    df.fillna(method='ffill', inplace=True)
    return df

In [4]:
# Step 3: Data Transformation
def data_transformation(df):
    # Example transformation: convert date columns to datetime type
    if 'Order Date' in df.columns:
        df['Order Date'] = pd.to_datetime(df['Order Date'])
    return df

In [5]:
# Step 4: Data Integration
def data_integration(df, other_dfs):
    for other_df in other_dfs:
        df = df.merge(other_df, how='left', on='common_column')
    return df

In [6]:
# Step 5: Data Scaling and Encoding
def data_scaling_and_encoding(df):
    # Identifying numerical and categorical columns
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = df.select_dtypes(include=['object']).columns
    
    # Creating transformers for numerical and categorical data
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())])
    
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    
    # Combining transformers into a ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)])
    
    return preprocessor, numerical_cols, categorical_cols

In [7]:
# Step 6: Feature Engineering
def feature_engineering(df):
    # Example feature engineering: creating a new feature from existing ones
    if 'Order Date' in df.columns:
        df['Order Year'] = df['Order Date'].dt.year
    return df

In [8]:
# Step 7: Data Validation
def data_validation(df):
    print("Data Info:")
    print(df.info())
    print("\nData Description:")
    print(df.describe())
    print("\nMissing Values:")
    print(df.isnull().sum())


In [11]:
 def main():
    # Load the main dataset
    df = load_data('Data-1.csv')
    
    # Perform Data Cleaning
    df = data_cleaning(df)
    
    # Perform Data Transformation
    df = data_transformation(df)
    
    # Perform Feature Engineering
    df = feature_engineering(df)
    
    # Perform Data Validation
    data_validation(df)
    
    # Perform Data Scaling and Encoding
    preprocessor, numerical_cols, categorical_cols = data_scaling_and_encoding(df)
    
    # Fit and transform the entire data
    df_transformed = preprocessor.fit_transform(df)
    
    # Check the shape of df_transformed
    print(f"Shape of df_transformed: {df_transformed.shape}")
    
    # Convert the preprocessed data back to DataFrame
    df_preprocessed = pd.DataFrame(df_transformed, columns=range(df_transformed.shape[1]))
    
    # Save the preprocessed data to a CSV file
    df_preprocessed.to_csv('Preprocessed_Data.csv', index=False)

    print("Data preprocessing completed and saved to CSV file.")

if __name__ == "__main__":
    main()


Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 24 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Ware_house_ID                 25000 non-null  object 
 1   WH_Manager_ID                 25000 non-null  object 
 2   Location_type                 25000 non-null  object 
 3   WH_capacity_size              25000 non-null  object 
 4   zone                          25000 non-null  object 
 5   WH_regional_zone              25000 non-null  object 
 6   num_refill_req_l3m            25000 non-null  int64  
 7   transport_issue_l1y           25000 non-null  int64  
 8   Competitor_in_mkt             25000 non-null  int64  
 9   retail_shop_num               25000 non-null  int64  
 10  wh_owner_type                 25000 non-null  object 
 11  distributor_num               25000 non-null  int64  
 12  flood_impacted                25000 non-null  int

ValueError: Shape of passed values is (25000, 1), indices imply (25000, 50038)