<a href="https://colab.research.google.com/github/Storm00212/Data-manipulation-using-python/blob/main/Simple_notebook_to_understand_dataset_cleaning_and_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

In [12]:
# Toy dataset used throughout the notebook. It deliberately contains the
# problems the later cells clean up: one duplicated row (John), one missing
# Name, one missing Age and one missing Income.
names = ['John', 'John', 'Anna', 'Peter', 'Linda', 'James', None]
ages = [28, 28, 22, np.nan, 32, 40, 25]
genders = ['Male', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female']
incomes = [50000, 50000, 60000, 55000, None, 70000, 62000]

# Column order of the frame follows the keyword order below.
data = dict(Name=names, Age=ages, Gender=genders, Income=incomes)
df = pd.DataFrame(data)
print(df)

    Name   Age  Gender   Income
0   John  28.0    Male  50000.0
1   John  28.0    Male  50000.0
2   Anna  22.0  Female  60000.0
3  Peter   NaN    Male  55000.0
4  Linda  32.0  Female      NaN
5  James  40.0    Male  70000.0
6   None  25.0  Female  62000.0


In [13]:
# HANDLING NULL VALUES
# Report how many entries are missing in each column before changing anything.
print(df.isnull().sum())
# Missing data can either be dropped or filled in.
# Dropping incomplete rows would look like: df_drop = df.dropna()
# Here the gaps are filled instead: the numeric columns receive their column
# mean and a missing Name is replaced by a placeholder string.
fill_values = {
    'Age': df['Age'].mean(),
    'Income': df['Income'].mean(),
    'Name': 'Unknown',
}
for column, fill_value in fill_values.items():
    df[column] = df[column].fillna(fill_value)
display(df)

Name      1
Age       1
Gender    0
Income    1
dtype: int64


Unnamed: 0,Name,Age,Gender,Income
0,John,28.0,Male,50000.0
1,John,28.0,Male,50000.0
2,Anna,22.0,Female,60000.0
3,Peter,29.166667,Male,55000.0
4,Linda,32.0,Female,57833.333333
5,James,40.0,Male,70000.0
6,Unknown,25.0,Female,62000.0


In [14]:
# VERIFYING THE NULL-VALUE HANDLING
# The preceding cell already imputed Age and Income with their column means
# and replaced the missing Name with 'Unknown'. Repeating those fillna calls
# here would change nothing, so this cell only confirms the result.
remaining_nulls = df.isnull().sum()
print(remaining_nulls)
# Fail loudly if any column still has gaps, so later cells (integer cast,
# encoding, scaling) never run on incomplete data.
assert remaining_nulls.sum() == 0, 'df still contains missing values'
display(df)

Name      0
Age       0
Gender    0
Income    0
dtype: int64


Unnamed: 0,Name,Age,Gender,Income
0,John,28.0,Male,50000.0
1,John,28.0,Male,50000.0
2,Anna,22.0,Female,60000.0
3,Peter,29.166667,Male,55000.0
4,Linda,32.0,Female,57833.333333
5,James,40.0,Male,70000.0
6,Unknown,25.0,Female,62000.0


In [18]:
# Handling duplicates
df = df.drop_duplicates()

display(df)

Unnamed: 0,Name,Age,Gender,Income
0,John,28.0,Male,50000.0
2,Anna,22.0,Female,60000.0
3,Peter,29.166667,Male,55000.0
4,Linda,32.0,Female,57833.333333
5,James,40.0,Male,70000.0
6,Unknown,25.0,Female,62000.0


In [19]:
# Convert Age from float to integer.
# Round before casting: astype(int) on its own truncates toward zero, so an
# imputed mean such as 29.8 would silently become 29 instead of 30.
df['Age'] = df['Age'].round().astype(int)
display(df)

Unnamed: 0,Name,Age,Gender,Income
0,John,28,Male,50000.0
2,Anna,22,Female,60000.0
3,Peter,29,Male,55000.0
4,Linda,32,Female,57833.333333
5,James,40,Male,70000.0
6,Unknown,25,Female,62000.0


In [21]:
# Encoding
# Categorical columns such as Gender must be turned into numbers before most
# models can use them.

# Label encoding: learn the distinct categories, then replace each value by
# its integer code (codes follow the sorted order of the categories).
encoder = LabelEncoder().fit(df['Gender'])
df['Gender'] = encoder.transform(df['Gender'])
display(df)

# One-hot encoding: one indicator column per category, with the first category
# dropped to avoid a redundant column. Because Gender was already replaced by
# its integer code above, the resulting indicator column is named after the
# code ('Gender_1') rather than the original label.
one_hot_columns = ['Gender']
df_encoded = pd.get_dummies(df, columns=one_hot_columns, drop_first=True)
display(df_encoded)


Unnamed: 0,Name,Age,Gender,Income
0,John,28,1,50000.0
2,Anna,22,0,60000.0
3,Peter,29,1,55000.0
4,Linda,32,0,57833.333333
5,James,40,1,70000.0
6,Unknown,25,0,62000.0


Unnamed: 0,Name,Age,Income,Gender_1
0,John,28,50000.0,True
2,Anna,22,60000.0,False
3,Peter,29,55000.0,True
4,Linda,32,57833.333333,False
5,James,40,70000.0,True
6,Unknown,25,62000.0,False


In [22]:
# Feature scaling
# Brings the numerical features onto a common [0, 1] range so that a model
# does not focus on a column merely because its raw values are large.
numeric_columns = ['Age', 'Income']
scaler = MinMaxScaler()
# Learn each column's min and max, then rescale the columns in place.
scaler.fit(df[numeric_columns])
df[numeric_columns] = scaler.transform(df[numeric_columns])
display(df)


Unnamed: 0,Name,Age,Gender,Income
0,John,0.333333,1,0.0
2,Anna,0.0,0,0.5
3,Peter,0.388889,1,0.25
4,Linda,0.555556,0,0.391667
5,James,1.0,1,1.0
6,Unknown,0.166667,0,0.6


In [23]:
# Splitting the dataset.
from sklearn.model_selection import train_test_split

# Threshold for the binary target, expressed in raw (unscaled) income units.
INCOME_THRESHOLD = 60000

X = df[['Age', 'Gender', 'Income']]
# df['Income'] was min-max scaled to [0, 1] in the feature-scaling cell, so
# comparing it with 60000 would label every row False. df_encoded was built
# before scaling and shares df's index, so it still holds the raw income.
y = df_encoded['Income'] > INCOME_THRESHOLD
# Guard against the features and the target drifting out of row alignment.
assert X.index.equals(y.index), 'features and target are not row-aligned'
# NOTE(review): X still contains the (scaled) Income column from which y is
# derived, which leaks the target into the features. Drop it from X before
# training a real model.

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

split_parts = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
for part_name, part in split_parts.items():
    print(f'{part_name} shape: {part.shape}')

X_train shape: (4, 3)
X_test shape: (2, 3)
y_train shape: (4,)
y_test shape: (2,)
