In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

def data_preprocessing(df):
  """Clean and standardize a DataFrame.

  Steps, in order:
    1. Split the columns into numeric features and categorical features
       (every non-numeric column, including bool, is treated as categorical).
    2. Fill missing numeric values with the column mean.
    3. Replace values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] with the column mean.
    4. Standardize the numeric columns with ``StandardScaler``.
    5. Fill missing categorical values with the column mode.

  Parameters
  ----------
  df : pandas.DataFrame
      Input data. It is left untouched; all work happens on a copy, so calling
      this function repeatedly on the same input gives the same result.

  Returns
  -------
  pandas.DataFrame
      A new DataFrame with the same columns and index as ``df``.
  """
  # Work on a copy so the caller's frame is not silently overwritten.
  df = df.copy()

  # Identify numeric and categorical features by dtype family rather than by
  # comparing against object dtype only: string, category, bool and datetime
  # columns must not be sent through the mean / IQR / scaling steps.
  numeric_features = list(df.select_dtypes(include=np.number).columns)
  numeric_lookup = set(numeric_features)
  categorical_features = [col for col in df.columns if col not in numeric_lookup]
  print(f"Categorical columns in the given dataset:{categorical_features}")
  print(f"Numerical columns in the given dataset: {numeric_features}")

  # StandardScaler rejects input with no columns or no rows, so skip the whole
  # numeric pipeline when there is nothing to process.
  if numeric_features and not df.empty:
    # Handle missing values: impute with the per-column mean.
    numeric_part = df[numeric_features]
    df[numeric_features] = numeric_part.fillna(numeric_part.mean())

    # Handle outliers: anything beyond 1.5 * IQR from the quartiles is replaced
    # with the column mean (the mean is taken before the replacement).
    for feature in numeric_features:
      column = df[feature]
      q1 = column.quantile(0.25)
      q3 = column.quantile(0.75)
      iqr = q3 - q1
      lower_bound = q1 - (1.5 * iqr)
      upper_bound = q3 + (1.5 * iqr)
      is_outlier = (column < lower_bound) | (column > upper_bound)
      df[feature] = np.where(is_outlier, column.mean(), column)

    # Normalize numeric features: fit and transform in a single pass.
    df[numeric_features] = StandardScaler().fit_transform(df[numeric_features])

  # Handle missing categorical data with the most frequent value per column.
  # mode() yields no rows when there are no categorical columns or when every
  # value is missing; in that case there is nothing to fill with.
  if categorical_features:
    modes = df[categorical_features].mode()
    if not modes.empty:
      df[categorical_features] = df[categorical_features].fillna(modes.iloc[0])

  return df

In [21]:
# Load the sample CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='/content/sample_data/data (1).csv')
print(data.head(5))

   NumericFeature1  NumericFeature2 CategoricalFeature
0              1.0                7                  A
1              2.0                8                  B
2              NaN                9                NaN
3              4.0               10                  A
4              5.0               11                  B


In [22]:
# Count the missing values in each column of the loaded data.
data.isna().sum()

NumericFeature1       1
NumericFeature2       0
CategoricalFeature    1
dtype: int64

In [23]:
# Run the preprocessing function on the loaded data and print the result.
transformed_data = data_preprocessing(df=data)
print(transformed_data)

Categorical columns in the given dataset:['CategoricalFeature']
Numerical columns in the given dataset: ['NumericFeature1', 'NumericFeature2']
   NumericFeature1  NumericFeature2 CategoricalFeature
0        -1.535624        -1.099370                  A
1        -0.944999        -0.749128                  B
2         0.000000        -0.398886                  A
3         0.236250        -0.048645                  A
4         0.826874         0.301597                  B
5         1.417499         1.994431                  C
