In [2]:
import numpy as np
import pandas as pd

In [3]:
data = {'Name': ['John', 'Anna', 'Peter', 'Linda', 'Tom'],
        'Age': [28, np.nan, 34, 29, np.nan],
        'Salary': [70000, 80000, 120000, np.nan, 65000]}

In [4]:
df = pd.DataFrame(data)

In [5]:
df

Unnamed: 0,Name,Age,Salary
0,John,28.0,70000.0
1,Anna,,80000.0
2,Peter,34.0,120000.0
3,Linda,29.0,
4,Tom,,65000.0


In [6]:
# Dropping rows with null values
df_dropped = df.dropna()

In [7]:
df_dropped

Unnamed: 0,Name,Age,Salary
0,John,28.0,70000.0
2,Peter,34.0,120000.0


In [8]:
df_filled = df.fillna(df.Age.mean()) # This has to be corrected later
df_filled

Unnamed: 0,Name,Age,Salary
0,John,28.0,70000.0
1,Anna,30.333333,80000.0
2,Peter,34.0,120000.0
3,Linda,29.0,30.333333
4,Tom,30.333333,65000.0


# Removing Duplicates

In [9]:
# Sample data with duplicates
data = {'Name': ['John', 'Anna', 'Peter', 'Anna', 'Tom'],
        'Age': [28, 24, 34, 24, 30],
        'Salary': [70000, 80000, 120000, 80000, 65000]}

In [11]:
df2 = pd.DataFrame(data)
df2

Unnamed: 0,Name,Age,Salary
0,John,28,70000
1,Anna,24,80000
2,Peter,34,120000
3,Anna,24,80000
4,Tom,30,65000


In [12]:
df_no_duplicates = df2.drop_duplicates()
df_no_duplicates

Unnamed: 0,Name,Age,Salary
0,John,28,70000
1,Anna,24,80000
2,Peter,34,120000
4,Tom,30,65000


# Dealing With Outliers

In [13]:
# Sample data with an outlier
data = {'Name': ['John', 'Anna', 'Peter', 'Linda', 'Tom'],
        'Age': [28, 24, 34, 29, 200],  # 200 is an outlier
        'Salary': [70000, 80000, 120000, 150000, 65000]}

In [14]:
df3 = pd.DataFrame(data)
df3

Unnamed: 0,Name,Age,Salary
0,John,28,70000
1,Anna,24,80000
2,Peter,34,120000
3,Linda,29,150000
4,Tom,200,65000


In [15]:
# Remove outlier using IQR (Interquartile Range)
Q1 = df['Age'].quantile(0.25)
Q3 = df['Age'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out outliers
df_outliers_removed = df[(df['Age'] >= lower_bound) & (df['Age'] <= upper_bound)]

print("\nAfter Removing Outliers:")
print(df_outliers_removed)


After Removing Outliers:
    Name   Age    Salary
0   John  28.0   70000.0
2  Peter  34.0  120000.0
3  Linda  29.0       NaN


# Encoding Categorical Data

## One Hot Encoding
One-hot encoding is a crucial technique in machine learning for converting categorical variables into a numerical form that algorithms can process effectively. It transforms each category into a unique binary vector, where only one element is set to 1 ("hot") and the rest are 0 ("cold"). This encoding ensures that no unintended ordinal relationships are introduced between categories, which is important because many algorithms assume numerical inputs have an order or magnitude. By representing categories this way, one-hot encoding allows algorithms to correctly interpret and utilize categorical data without misinterpreting it as numerical values, making it particularly valuable in models like decision trees, neural networks, and other algorithms that handle categorical data.

In [16]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [28]:
# Sample data
data = {'Name': ['John', 'Anna', 'Peter', 'Linda', 'Tom'],
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
        'City': ['New York', 'Paris', 'London', 'New York', 'London']}

In [36]:
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Gender,City
0,John,Male,New York
1,Anna,Female,Paris
2,Peter,Male,London
3,Linda,Female,New York
4,Tom,Male,London


In [37]:
le = LabelEncoder()
df['Gender_Encoded'] = le.fit_transform(df['Gender'])

In [38]:
df

Unnamed: 0,Name,Gender,City,Gender_Encoded
0,John,Male,New York,1
1,Anna,Female,Paris,0
2,Peter,Male,London,1
3,Linda,Female,New York,0
4,Tom,Male,London,1


In [40]:
df2 = pd.get_dummies(df, columns=['City'])

In [41]:
df2

Unnamed: 0,Name,Gender,Gender_Encoded,City_London,City_New York,City_Paris
0,John,Male,1,False,True,False
1,Anna,Female,0,False,False,True
2,Peter,Male,1,True,False,False
3,Linda,Female,0,False,True,False
4,Tom,Male,1,True,False,False


# Feature Scaling : Normalization and Standarization

In [44]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [45]:
# Sample data
data = {'Name': ['John', 'Anna', 'Peter', 'Linda', 'Tom'],
        'Age': [28, 24, 34, 29, 30],
        'Salary': [70000, 80000, 120000, 150000, 65000]}

In [47]:
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Salary
0,John,28,70000
1,Anna,24,80000
2,Peter,34,120000
3,Linda,29,150000
4,Tom,30,65000


In [51]:
scalar = StandardScaler()
df['Age_Scaled'] = scalar.fit_transform(df[['Age']])
df['Salary_Standarized'] = scalar.fit_transform(df[['Salary']])

In [52]:
df

Unnamed: 0,Name,Age,Salary,Age_Scaled,Salary_Standarized
0,John,28,70000,-0.310087,-0.82311
1,Anna,24,80000,-1.550434,-0.518254
2,Peter,34,120000,1.550434,0.701167
3,Linda,29,150000,0.0,1.615734
4,Tom,30,65000,0.310087,-0.975537
