In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Read the raw CSV from its fixed location on the local disk.
url = "C:/Users/Admin/Desktop/Ram_Datamininig/Chapter2/DataPreprocessing.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Report the size and column names of the data exactly as loaded,
# before any cleaning is applied.
print("Original dataset shape:", df.shape)
print("Columns in dataset:", df.columns.tolist())

# Discard every row that contains at least one missing value, then
# report the size again so the number of removed rows is visible.
df = df.dropna(axis=0, how="any")
print("After dropping missing values:", df.shape)

# Replace every object-typed column with integer codes.
# One LabelEncoder is created per column and kept in `label_encoders`
# (keyed by column name) so the codes can be mapped back later.
# NOTE(review): integer codes imply an ordering between categories;
# confirm that is acceptable for any nominal feature columns.
categorical_cols = df.select_dtypes(include=["object"]).columns

label_encoders = {column_name: LabelEncoder() for column_name in categorical_cols}
for column_name, encoder in label_encoders.items():
    # Values are cast to str first so each column is encoded as text labels.
    text_values = df[column_name].astype(str)
    df[column_name] = encoder.fit_transform(text_values)

# Name of the column the model should predict.
target_column = 'Online Shopper'

# Fail early, listing the available columns, when the target is absent.
if target_column not in df.columns:
    raise ValueError(f"Column '{target_column}' not found in dataset. Available columns: {df.columns.tolist()}")

# Separate the target series from the remaining feature columns.
y = df[target_column]
X = df.drop(columns=target_column)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Standardize the features. The scaler is fitted on the training rows
# only, and the same fitted scaler is then applied to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report the final shapes of the scaled feature matrices.
print("Final Preprocessed Training Data Shape:", X_train_scaled.shape)
print("Final Preprocessed Test Data Shape:", X_test_scaled.shape)


Original dataset shape: (10, 4)
Columns in dataset: ['Region', 'Age', 'Income', 'Online Shopper']
After dropping missing values: (8, 4)
Final Preprocessed Training Data Shape: (6, 3)
Final Preprocessed Test Data Shape: (2, 3)
