# <span style="color:red">*Handling Numerical Data*</span>

### <span style="color:blue">*_Rescaling a Feature_*</span>
#### To rescale the values of a numerical feature to be between two values

In [2]:
import numpy as np
# One numerical feature laid out as a column vector (one row per observation)
feature = np.array([-500.5, -100.1, 0, 100.1, 900.9]).reshape(-1, 1)
feature

array([[-500.5],
       [-100.1],
       [   0. ],
       [ 100.1],
       [ 900.9]])

In [8]:
from sklearn import preprocessing
# Build a scaler that maps the smallest observed value to 0 and the largest to 1
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))

# Learn the minimum/maximum from the feature, then rescale it
minmax_scale.fit(feature)
scaled_feature = minmax_scale.transform(feature)
scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

## <span style="color:red">*_Standardizing a Feature_*</span>
### To transform a feature to have a mean of 0 and a standard deviation of 1


In [9]:
import numpy as np
from sklearn import preprocessing
# One numerical feature laid out as a column vector (one row per observation)
feature = np.array([-500.5, -100.1, 0, 100.1, 900.9]).reshape(-1, 1)

# Build the standardizer (output has zero mean and unit variance)
scaler = preprocessing.StandardScaler()

# Estimate mean and standard deviation from the feature, then apply them
standardized_feature = scaler.fit(feature).transform(feature)
standardized_feature

array([[-1.26687088],
       [-0.39316683],
       [-0.17474081],
       [ 0.0436852 ],
       [ 1.79109332]])

## <span style="color:red">*Transforming Features*</span>
### To make a custom transformation to one or more features


In [22]:
# Transform method_1
import numpy as np
import pandas as pd


def add_ten(x):
    """Return ``x`` shifted up by ten."""
    return x + 10


# Two features, three observations
features = np.array([[2, 3], [2, 3], [3, 5]])
df = pd.DataFrame(features, columns=['feature_1', 'feature_2'])
print(df)

# Transform feature_1 only, then put it back next to the untouched feature_2
df1 = df['feature_1'].apply(add_ten)
df2 = pd.concat([df1, df['feature_2']], axis=1)
print(df2)

   feature_1  feature_2
0          2          3
1          2          3
2          3          5
   feature_1  feature_2
0         12          3
1         12          3
2         13          5


In [60]:
# Transform method_2
import pandas as pd

# Assemble the listings table in one step from a column -> values mapping
houses = pd.DataFrame({
    'Price': [534433, 392333, 293222, 4322032],
    'Bathrooms': [2, 3.5, 2, 116],
    'Square_Feet': [1500, 2500, 1500, 48000],
})

# Derive a new feature from two existing ones: price per square foot
houses['Price_Per_Square'] = houses['Price'].div(houses['Square_Feet'])
houses.head()

Unnamed: 0,Price,Bathrooms,Square_Feet,Price_Per_Square
0,534433,2.0,1500,356.288667
1,392333,3.5,2500,156.9332
2,293222,2.0,1500,195.481333
3,4322032,116.0,48000,90.042333


## _Detecting Outliers_
### To identify extreme observations

In [24]:
# Define outliers with the 1.5 * IQR rule
# NOTE(review): Raw_df is not created in any cell visible here — make sure it is loaded before this cell runs.
Q1, Q3 = (Raw_df['PassengerId'].quantile(q) for q in (0.25, 0.75))  # first and third quartiles
iqr = Q3 - Q1
lower_bound = Q1 - 1.5 * iqr
upper_bound = Q3 + 1.5 * iqr

# Mark every row: -1 when the value lies outside the fences, 1 when it lies within them (fences inclusive)
Raw_df.loc[(Raw_df['PassengerId'] < lower_bound) | (Raw_df['PassengerId'] > upper_bound), 'Outlier'] = -1
Raw_df.loc[Raw_df['PassengerId'].between(lower_bound, upper_bound), 'Outlier'] = 1
Raw_df.head()

array([[10000, 10000],
       [    2,     3],
       [    3,     5]])

In [46]:
# Handling outliers
import pandas as pd

# Assemble the listings table in one step from a column -> values mapping
houses = pd.DataFrame({
    'Price': [534433, 392333, 293222, 4322032],
    'Bathrooms': [2, 3.5, 2, 116],
    'Square_Feet': [1500, 2500, 1500, 48000],
})

# Filter observations: select rows with fewer than 20 bathrooms.
# The selection is not assigned anywhere, so `houses` itself is left unchanged.
houses.loc[houses['Bathrooms'] < 20]


# Filter and mark outlier
# NOTE(review): the filter above uses `< 20` while this marker uses `<= 20` — confirm which boundary is intended.
def outlier(x):
    """Return 1 when ``x`` is at most 20, otherwise 0."""
    return 1 if x <= 20 else 0


# 1 flags rows with at most 20 bathrooms, 0 flags the rest
houses['Outlier'] = houses['Bathrooms'].map(outlier)

## _Apply_ <span style="color:red">*(Series or DataFrame)*</span>, _Map_ (Series) & _Applymap_ (DataFrame only)
### To transform data in a DataFrame/Series
<span style="color:blue">some *To transform data in a DataFrame/Series* text</span>

In [48]:
# Small demo frame: one string column and three numeric columns, indexed by county
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
        'year': [2012, 2012, 2013, 2014, 2014],
        'reports': [4, 24, 31, 2, 3],
        'coverage': [25, 94, 57, 62, 70]}
df = pd.DataFrame(data, index = ['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'])

# Create a lambda function that upper-cases a string
capitalizer = lambda x: x.upper()

# apply() can apply a function along any axis of the dataframe
# (the result is not the last expression of the cell, so it is not displayed)
df['name'].apply(capitalizer)

# map() applies an operation over each element of a series
df['name'].map(capitalizer)

# applymap() applies a function to every single element in the entire dataframe.

# Drop the string variable so that the element-wise square root can run
df = df.drop('name', axis=1)

# DataFrame.applymap() is deprecated since pandas 2.1 in favour of
# DataFrame.map(), which does the same element-wise application.
# Use whichever method the installed pandas provides so the cell keeps
# running (and stays warning-free) across versions.
elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap

# Return the square root of every cell in the dataframe
elementwise(np.sqrt)

Unnamed: 0,coverage,reports,year
Cochice,5.0,2.0,44.855323
Pima,9.69536,4.898979,44.855323
Santa Cruz,7.549834,5.567764,44.866469
Maricopa,7.874008,1.414214,44.877611
Yuma,8.3666,1.732051,44.877611
