# Handling Numerical Data

Ghanshyam Kanojiya | T087

# 3.1 Rescaling a Feature

In [None]:
import pandas as pd
from sklearn import preprocessing

# Load the dataset. Every section of this notebook works on the same World Bank
# CSV; reading a different file here made this cell inconsistent with the rest
# of the notebook (and with the 'Population' column it scales).
df = pd.read_csv("world_bank_dataset.csv")

# Select the feature with double brackets so the result is a 2-D
# (n_samples, 1) array, which is the input layout scikit-learn scalers expect.
feature = df[['Population']].values

# Create a scaler that maps the feature linearly onto the range [0, 1].
minmax_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))

# Learn the column min/max and rescale in a single step.
scaled_feature = minmax_scaler.fit_transform(feature)

print(scaled_feature[:10])  # print first 10 scaled values

[[0.55040354]
 [0.59609298]
 [0.44344092]
 [0.06977401]
 [0.01326533]
 [0.47886217]
 [0.99647199]
 [0.0727577 ]
 [0.97548535]
 [0.53227963]]


# 3.2 Standardizing a Feature

In [22]:
import pandas as pd
from sklearn import preprocessing

# Read the World Bank data and keep 'Population' as a single-column 2-D array.
df = pd.read_csv("world_bank_dataset.csv")
feature = df[['Population']].to_numpy()

# Standardize: subtract the column mean and divide by its standard deviation.
scaler = preprocessing.StandardScaler()
standardized = scaler.fit_transform(feature)

first_standardized = standardized[:10]
print(first_standardized)  # first 10 standardized values

# Sanity check: after standardization the mean is ~0 and the std is ~1.
print("Mean and Standard Deviation")

rounded_mean = round(standardized.mean())
spread = standardized.std()
print("Mean: {}".format(rounded_mean))
print("Standard Deviation: {}".format(spread))

# Alternative scaler based on the median and interquartile range.
print("Using RobustScaler")

robust_scaler = preprocessing.RobustScaler()
robust_scaled = robust_scaler.fit_transform(feature)

first_robust = robust_scaled[:10]
print(first_robust)  # first 10 robust-scaled values



[[ 0.20605122]
 [ 0.36131615]
 [-0.15743633]
 [-1.42725637]
 [-1.61928796]
 [-0.03706548]
 [ 1.72191102]
 [-1.41711699]
 [ 1.65059283]
 [ 0.14446135]]
Mean and Standard Deviation
Mean: 0
Standard Deviation: 1.0
Using RobustScaler
[[ 0.14556278]
 [ 0.23775479]
 [-0.07026607]
 [-0.82425007]
 [-0.93827311]
 [ 0.00120681]
 [ 1.04563837]
 [-0.81822959]
 [ 1.0032916 ]
 [ 0.10899242]]


# 3.3 Normalizing Observations

In [19]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import Normalizer

# Read the World Bank data.
df = pd.read_csv("world_bank_dataset.csv")

# Two-column feature matrix: one row per observation.
feature_columns = ['GDP (USD)', 'Population']
features = df[feature_columns].to_numpy()

# Normalizer rescales each row (observation) to unit L2 norm.
normalizer = Normalizer(norm="l2")

# Normalize only the first 10 observations; as the last expression of the
# cell the resulting array is displayed as the cell output.
first_ten_rows = features[:10]
normalizer.transform(first_ten_rows)



array([[9.99999846e-01, 5.55189371e-04],
       [9.99999999e-01, 5.10747208e-05],
       [9.99999999e-01, 4.07776766e-05],
       [1.00000000e+00, 9.57106007e-06],
       [1.00000000e+00, 1.11147691e-05],
       [9.99999998e-01, 6.74545826e-05],
       [9.99999976e-01, 2.19806281e-04],
       [1.00000000e+00, 1.24894108e-05],
       [9.99999994e-01, 1.08152786e-04],
       [9.99999998e-01, 5.62557865e-05]])

In [17]:
# L1 normalization of the feature matrix built in the previous cell.
l1_normalizer = Normalizer(norm="l1")
features_l1_norm = l1_normalizer.transform(features)

# Report the total of the first normalized observation.
first_row_total = features_l1_norm[0].sum()
print("Sum of the first observation's values: {}".format(first_row_total))


Sum of the first observation's values: 1.0


# 3.4 Grouping Observations Using Clustering

In [33]:
import pandas as pd
from sklearn.cluster import KMeans

# Read the World Bank data.
df = pd.read_csv("world_bank_dataset.csv")

# Feature matrix used for clustering.
cluster_columns = ['CO2 Emissions (metric tons per capita)', 'Population']
features = df[cluster_columns].to_numpy()

# NOTE(review): in the rows displayed below, Population is many orders of
# magnitude larger than CO2 emissions, so the unscaled distance is presumably
# dominated by Population -- consider standardizing before clustering.

# Build a 3-cluster k-means model (fixed seed for repeatability) and fit it.
clusterer = KMeans(n_clusters=3, random_state=0).fit(features)

# Attach each observation's predicted cluster as a new 'group' column.
df = df.assign(group=clusterer.predict(features))

df.head(6)


Unnamed: 0,Country,Year,GDP (USD),Population,Life Expectancy,Unemployment Rate (%),CO2 Emissions (metric tons per capita),Access to Electricity (%),group
0,Brazil,2010,1493220000000.0,829020000.0,66.7,3.81,10.79,76.76,0
1,Japan,2011,17562700000000.0,897010000.0,61.4,17.98,15.67,67.86,0
2,India,2012,16426880000000.0,669850000.0,69.1,16.02,2.08,81.08,0
3,Mexico,2013,11890010000000.0,113800000.0,80.1,6.26,19.13,53.46,1
4,India,2014,2673020000000.0,29710000.0,62.7,3.1,15.66,82.17,1
5,United Kingdom,2015,10711800000000.0,722560000.0,74.6,23.24,7.55,64.48,0


# 3.5 Deleting Observations with Missing Values

In [34]:
import pandas as pd

# Read the World Bank data.
df = pd.read_csv("world_bank_dataset.csv")

# Keep only the two columns of interest.
selected_columns = ['CO2 Emissions (metric tons per capita)', 'Population']
emissions_population = df[selected_columns]

# Discard every row containing a missing value and show the first five
# remaining rows (dropna returns a new frame; the selection is not modified).
complete_rows = emissions_population.dropna()
complete_rows.head(5)


Unnamed: 0,CO2 Emissions (metric tons per capita),Population
0,10.79,829020000.0
1,15.67,897010000.0
2,2.08,669850000.0
3,19.13,113800000.0
4,15.66,29710000.0


In [32]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Read the World Bank data.
df = pd.read_csv("world_bank_dataset.csv")

# Two-column feature matrix.
impute_columns = ['CO2 Emissions (metric tons per capita)', 'Population']
features = df[impute_columns].to_numpy()

# Standardize both columns.
standardized_features = StandardScaler().fit_transform(features)

# Remember the real value of the first cell, then blank it out so the
# imputer has something to fill in.
held_out_value = standardized_features[0, 0]
standardized_features[0, 0] = np.nan

# Fill missing entries with the mean of their column.
mean_imputer = SimpleImputer(strategy="mean")
features_mean_imputed = mean_imputer.fit_transform(standardized_features)

# Compare the value that was removed with the one the imputer produced.
imputed_value = features_mean_imputed[0, 0]
print("True Value: {}".format(held_out_value))
print("Imputed Value: {}".format(imputed_value))


True Value: 0.12843492941911622
Imputed Value: -0.0006454016553723276
