# Data Preprocessing Project

Clean, error-free notebook for data preprocessing and ML readiness.

## 1. Import Libraries

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler


## 2. Load Dataset

In [None]:

# Load the raw data. The path is relative, so dataset.csv must be in the
# same folder (the working directory) as this notebook.
csv_path = "dataset.csv"
df = pd.read_csv(csv_path)
# Preview the first rows to confirm the file parsed as expected.
df.head()


## 3. Basic Exploration

In [None]:

# Print the column dtypes and non-null counts.
df.info()
# Summary statistics for every column, not only the numeric ones.
summary = df.describe(include='all')
summary


### Unique values and count

In [None]:

# Report, for each column, how many distinct values it holds and what they are.
for column_name in df.columns:
    column = df[column_name]
    print(f"\nColumn: {column_name}")
    print("Unique count:", column.nunique())
    print("Unique values:", column.unique())


## 4. Rename Columns (standard format)

In [None]:

# Normalise the column labels step by step:
# trim surrounding whitespace, lower-case, then turn spaces into underscores.
normalized_names = df.columns.str.strip()
normalized_names = normalized_names.str.lower()
normalized_names = normalized_names.str.replace(" ", "_")
df.columns = normalized_names
df.columns


## 5. Data Cleaning

### Missing values check

In [None]:

# Count the missing entries in each column.
missing_per_column = df.isnull().sum()
missing_per_column


### Replace age = 0 with NaN (if column exists)

In [None]:

# Turn age == 0 into NaN so the missing-value step below fills it.
# Nothing happens when the dataset has no 'age' column.
if 'age' in df.columns:
    age_without_zeros = df['age'].replace(to_replace=0, value=np.nan)
    df['age'] = age_without_zeros


### Remove duplicate rows

In [None]:

# Drop rows that repeat an earlier row exactly; the first occurrence is kept.
df = df.drop_duplicates(keep="first")


### Handle missing values

In [None]:

# Fill missing values column by column:
#   * object (text) columns -> most frequent value (mode)
#   * every other column    -> median
for col in df.columns:
    column = df[col]
    if not column.isnull().any():
        continue  # nothing to fill in this column

    if column.dtype == 'object':
        modes = column.mode()
        if modes.empty:
            # The column has no non-null values, so there is no mode to
            # fill with; leave it untouched instead of raising.
            continue
        fill_value = modes.iloc[0]
    else:
        fill_value = column.median()

    # Assign the filled column back to df. Calling fillna(inplace=True) on
    # df[col] acts on an intermediate object: pandas 2.x warns about it and
    # under Copy-on-Write df itself is not updated.
    df[col] = column.fillna(fill_value)

# Confirm what is left unfilled.
df.isnull().sum()


## 6. Data Analysis

### Filter data: age > 40 and salary < 5000

In [None]:

# Keep the rows with age > 40 and salary < 5000.
# Skipped entirely when either column is absent from the dataset.
if 'age' in df.columns and 'salary' in df.columns:
    filtered_df = df[(df['age'] > 40) & (df['salary'] < 5000)]
    # A bare expression inside an `if` body is not auto-displayed by the
    # notebook (only the last top-level expression of a cell is), so print
    # the preview explicitly.
    print(filtered_df.head())


### Age vs Salary plot

In [None]:

# Scatter plot of salary against age; drawn only when both columns exist.
if {'age', 'salary'}.issubset(df.columns):
    ages = df['age']
    salaries = df['salary']

    plt.figure()
    plt.scatter(ages, salaries)
    plt.title('Age vs Salary')
    plt.xlabel('Age')
    plt.ylabel('Salary')
    plt.show()


### Count of people from each place

In [None]:

# Bar chart of how many rows belong to each place; skipped without a 'place' column.
if 'place' in df.columns:
    place_counts = df['place'].value_counts()
    place_counts.plot(kind='bar')
    plt.title('People from Each Place')
    plt.xlabel('Place')
    plt.ylabel('Count')
    plt.show()


## 7. Data Encoding

### Label Encoding (Gender)

In [None]:

# Replace the gender labels with integer codes.
# `le` keeps the fitted encoder so the mapping is still available afterwards.
if 'gender' in df.columns:
    le = LabelEncoder()
    gender_codes = le.fit_transform(df['gender'])
    df['gender'] = gender_codes


### One-Hot Encoding (Place)

In [None]:

# One-hot encode 'place' when the column is present. drop_first=True omits
# the first category's indicator column.
one_hot_columns = [name for name in ('place',) if name in df.columns]
if one_hot_columns:
    df = pd.get_dummies(df, columns=one_hot_columns, drop_first=True)

df.head()


## 8. Feature Scaling

### Standard Scaler

In [None]:

# Standardise every numeric column on a copy, leaving df itself unscaled.
scaler = StandardScaler()

df_standard = df.copy()
num_cols = df_standard.select_dtypes(include=np.number).columns

numeric_values = df_standard[num_cols]
standardized_values = scaler.fit_transform(numeric_values)
df_standard[num_cols] = standardized_values
df_standard.head()


### Min-Max Scaler

In [None]:

# Min-max scale every numeric column on a copy, leaving df itself unscaled.
df_minmax = df.copy()

# Recompute the numeric columns here instead of relying on the variable set
# by the Standard Scaler cell, so this cell also works when that cell was
# skipped or the notebook is run out of order. When the cells do run in
# order, both frames are copies of the same df and the selection is unchanged.
num_cols = df_minmax.select_dtypes(include=np.number).columns

minmax = MinMaxScaler()
df_minmax[num_cols] = minmax.fit_transform(df_minmax[num_cols])
df_minmax.head()


## ✅ Preprocessing Completed Successfully