# QUESTION 2:

Apply data pre-processing techniques such as standardization/normalization, transformation, aggregation, discretization/binarization, sampling, etc., to any dataset.

In [None]:
import numpy as np
import pandas as pd

# Build a reproducible synthetic dataset: 100 integer ages drawn from [20, 70)
# and 100 integer incomes drawn from [20000, 100000). Seeding the global NumPy
# generator fixes the draws; Age is drawn before Income so the random stream is
# consumed in a fixed order.
np.random.seed(0)
ages = np.random.randint(20, 70, size=100)
incomes = np.random.randint(20000, 100000, size=100)
data = dict(Age=ages, Income=incomes)

df = pd.DataFrame(data)
print("Original Dataset:")
print("dimension: ", df.shape)
print(df.head())

Original Dataset:
dimension:  (100, 2)
   Age  Income
0   64   91331
1   67   70624
2   20   60133
3   23   75153
4   23   82756


# Standardization and Normalization

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize every column to zero mean and unit variance (z-scores).
# Note: StandardScaler scales by the population standard deviation (ddof=0).
scaler = StandardScaler()
standardized_values = scaler.fit_transform(df)  # array result: row/column labels are dropped
# Re-attach the column names AND the original row index, so the scaled frame
# stays row-aligned with `df` even if `df` carries a non-default index.
df_standardized = pd.DataFrame(standardized_values, columns=df.columns, index=df.index)
print("Standardized Dataset")
print(df_standardized.head())

Standardized Dataset
        Age    Income
0  1.464076  1.249772
1  1.668366  0.397580
2 -1.532173 -0.034175
3 -1.327883  0.583970
4 -1.327883  0.896870


In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max normalization: linearly rescale each column so that its minimum maps
# to 0 and its maximum maps to 1 (MinMaxScaler's default feature range).
# Note: this rebinds the name `scaler` to a new MinMaxScaler instance.
scaler = MinMaxScaler()
normalized_values = scaler.fit_transform(df)  # array result: row/column labels are dropped
# Re-attach the column names AND the original row index, so the normalized
# frame stays row-aligned with `df` even if `df` carries a non-default index.
df_normalized = pd.DataFrame(normalized_values, columns=df.columns, index=df.index)
print("Normalized Dataset")
print(df_normalized.head())

Normalized Dataset
        Age    Income
0  0.897959  0.891189
1  0.959184  0.630769
2  0.000000  0.498830
3  0.061224  0.687728
4  0.061224  0.783346


In [None]:
# Sanity check: after standardization each column's mean should be ~0
# (values on the order of 1e-16 are floating-point round-off).
df_standardized.mean(axis=0)

Unnamed: 0,0
Age,1.1102230000000002e-17
Income,-1.110223e-16


In [None]:
# Sanity check: after standardization each column's standard deviation should be 1.
# StandardScaler scales by the POPULATION standard deviation (ddof=0), while
# pandas' .std() defaults to the SAMPLE standard deviation (ddof=1). With the
# default, this check reports sqrt(n / (n - 1)) ~= 1.005038 for n = 100 rows
# instead of 1. Passing ddof=0 makes the check use the scaler's own definition.
df_standardized.std(ddof=0)

Unnamed: 0,0
Age,1.005038
Income,1.005038


## Logarithmic Transformation of Income

In [None]:
# Transformation (logarithmic): replace Income with its natural logarithm.
# `assign` returns a new frame, so the original `df` is left untouched.
df_transformed = df.assign(Income=np.log(df['Income']))
print("Transformed Dataset:")
print(df_transformed.head())

Transformed Dataset:
   Age     Income
0   64  11.422246
1   67  11.165125
2   20  11.004314
3   23  11.227281
4   23  11.323652


## Aggregate the data based on age

In [None]:
# Aggregation: average the remaining columns (here, Income) within each distinct
# Age value. `as_index=False` keeps Age as an ordinary column instead of moving
# it into the index, so no reset_index() step is needed.
df_aggregated = df.groupby('Age', as_index=False).mean()
print("Aggregated Dataset:")
print(df_aggregated.head())

Aggregated Dataset:
   Age        Income
0   20  50257.600000
1   21  65422.750000
2   23  77726.333333
3   24  58175.000000
4   25  68134.000000


## Discretization and Binarization

In [None]:
# Discretization: bucket Age into three labelled bands. pd.cut uses right-closed
# intervals by default, so the bands are (0, 30], (30, 50] and (50, inf].
bins = [0, 30, 50, np.inf]
labels = ['Young', 'Adult', 'Elderly']
# `assign` returns a new frame with the extra column; `df` itself is not modified.
df_discretized = df.assign(
    Age_Group=pd.cut(df['Age'], bins=bins, labels=labels)
)
print("Discretized Dataset:")
print(df_discretized.head())

Discretized Dataset:
   Age  Income Age_Group
0   64   91331   Elderly
1   67   70624   Elderly
2   20   60133     Young
3   23   75153     Young
4   23   82756     Young


In [None]:
from sklearn.preprocessing import Binarizer
# Binarization: Binarizer maps values strictly greater than the threshold to 1
# and all other values to 0.
threshold = 30
binarizer = Binarizer(threshold=threshold)
# The transform result for the single 'Age' column carries no usable row labels.
# Flatten it to 1-D and attach it with `assign`, so the new column inherits
# df's own index. Building a separate default-indexed frame and concatenating
# on axis=1 would misalign rows (and introduce NaNs) whenever df's index is
# not the default 0..n-1.
binarized_age = np.asarray(binarizer.fit_transform(df[['Age']])).ravel()
df_binarized = df.assign(Binarized_Age=binarized_age)
print("Binarized Dataset:")
print(df_binarized.head())

Binarized Dataset:
   Age  Income  Binarized_Age
0   64   91331              1
1   67   70624              1
2   20   60133              0
3   23   75153              0
4   23   82756              0


## Random Sampling

In [None]:
# Simple random sampling: draw 10% of the rows. The fixed random_state makes
# the selection reproducible; the sampled rows keep their original index labels.
df_sampled = df.sample(random_state=1, frac=0.1)
print("Sampled Dataset")
print(df_sampled)

Sampled Dataset
    Age  Income
80   40   35741
84   24   55050
33   38   99128
81   31   77368
93   55   99464
17   59   76894
36   69   38728
82   24   63986
69   55   94253
65   21   80155
