Removal of missing & duplicate rows, type conversion of columns & normalization of numeric columns

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Build a small demo table. It deliberately contains a repeated row (the
# second "Alice" entry) and a few missing values so that the cleaning steps
# further down have something to act on.
data = dict(
    Name=["Alice", "Bob", "Charlie", "David", "Eva", "Alice", None],
    Age=[25, 30, 35, 40, 28, 25, 32],
    Salary=[50000, 60000, None, 70000, 65000, 50000, 62000],
    Department=["HR", "IT", "IT", "Finance", "HR", "HR", None],
)

# Write the demo table to disk (without the index column) so the rest of the
# cell can start from a CSV file, exactly as it would with real data.
df = pd.DataFrame(data)
df.to_csv("demo_dataset.csv", index=False)
print("Demo CSV file created: demo_dataset.csv")

# Read the file back in. Point this at your own CSV to clean real data.
df = pd.read_csv("demo_dataset.csv")
print("Original DataFrame:", df.head(), sep="\n")

# Removing rows with missing values (any column).
df = df.dropna()

# Removing duplicate rows (all columns must match for a row to be dropped).
df = df.drop_duplicates()

# Converting column types.
# Explicit fixed-width dtypes are used rather than the Python builtins
# `int` / `float`: `astype(int)` resolves to the platform's C long on
# NumPy < 2 (32-bit on Windows), whereas 'int64' / 'float64' give the same
# dtype everywhere, so dtype-based column selection downstream behaves
# identically on every platform.
if 'Age' in df.columns:
    df['Age'] = df['Age'].astype('int64')
if 'Salary' in df.columns:
    df['Salary'] = df['Salary'].astype('float64')

# Normalizing numeric columns with min-max scaling.
# 'number' selects every numeric dtype; listing only int64/float64 would
# silently leave columns of other widths (e.g. int32, float32) unscaled.
numeric_cols = df.select_dtypes(include='number').columns
scaler = MinMaxScaler()

# MinMaxScaler.fit_transform raises ValueError when it receives zero rows or
# zero columns, which happens if the cleaning above removed every row or the
# dataset has no numeric columns. In that case there is nothing to scale.
if len(numeric_cols) > 0 and not df.empty:
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Persist the cleaned table; the index is omitted from the file.
df.to_csv("cleaned_data.csv", index=False)
print("\nCleaned dataset saved as 'cleaned_data.csv'")

# Show the first rows of the cleaned table.
print("\nCleaned DataFrame:", df.head(), sep="\n")

Demo CSV file created: demo_dataset.csv
Original DataFrame:
      Name  Age   Salary Department
0    Alice   25  50000.0         HR
1      Bob   30  60000.0         IT
2  Charlie   35      NaN         IT
3    David   40  70000.0    Finance
4      Eva   28  65000.0         HR

Cleaned dataset saved as 'cleaned_data.csv'

Cleaned DataFrame:
    Name       Age  Salary Department
0  Alice  0.000000    0.00         HR
1    Bob  0.333333    0.50         IT
3  David  1.000000    1.00    Finance
4    Eva  0.200000    0.75         HR
