In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
from sklearn.datasets import load_iris
iris = load_iris()
data = pd.DataFrame(data= np.c_[iris['data'], iris['target']],
                     columns= iris['feature_names'] + ['target'])

In [3]:
# Check the initial size of the dataset
initial_size = data.shape
print("Initial data size:", initial_size)

Initial data size: (150, 5)


In [4]:
# Assuming 'sepal length (cm)' column has missing values
data['sepal length (cm)'].fillna(data['sepal length (cm)'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['sepal length (cm)'].fillna(data['sepal length (cm)'].mean(), inplace=True)


In [5]:
X = data.drop('target', axis=1)  # Features
y = data['target']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler to the training data and transform it
X_train = scaler.fit_transform(X_train)

# Transform the test data using the fitted scaler
X_test = scaler.transform(X_test)

In [7]:
# Check the size of the dataset after pre-processing
final_size_train = X_train.shape
final_size_test = X_test.shape
print("Training data size after pre-processing:", final_size_train)
print("Testing data size after pre-processing:", final_size_test)

Training data size after pre-processing: (120, 4)
Testing data size after pre-processing: (30, 4)


In [8]:
from sklearn.preprocessing import MinMaxScaler

# Create a MinMaxScaler object for the target variable
target_scaler = MinMaxScaler()

# Reshape the target variable to a 2D array
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)

# Fit the scaler to the training target data and transform it
y_train_scaled = target_scaler.fit_transform(y_train)

# Transform the test target data using the fitted scaler
y_test_scaled = target_scaler.transform(y_test)

In [9]:
from tensorflow.keras.utils import to_categorical

# Assuming 'y_train_scaled' and 'y_test_scaled' are your scaled target variables
y_train_categorical = to_categorical(y_train_scaled)
y_test_categorical = to_categorical(y_test_scaled)

In [11]:
# Create DataFrames for the scaled data
scaled_train_data = pd.DataFrame(X_train, columns=X.columns)
# Extract the class labels (0, 1, or 2) from the one-hot encoded array
# Since to_categorical was applied to MinMaxScaled data (0-1 range), argmax finds the index of the max value,
# which is also the index of class label 0, 1, 2
# This is due to a bit of a hack where to_categorical is used where it shouldn't be (see discussion for more)
scaled_train_data['target'] = np.argmax(y_train_categorical, axis=1)

scaled_test_data = pd.DataFrame(X_test, columns=X.columns)
scaled_test_data['target'] = np.argmax(y_test_categorical, axis=1)

# Save the DataFrames to CSV files
scaled_train_data.to_csv('train_data_scaled.csv', index=False)
scaled_test_data.to_csv('test_data_scaled.csv', index=False)