In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
# Load the four example datasets. The names on the left are bound in the same
# order as the CSV paths listed below.
df_autos, df_houses, df_concrete, df_customer = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../CSVs/autos.csv',
        '../CSVs/ames.csv',
        '../CSVs/concrete.csv',
        '../CSVs/WA_Fn-UseC_-Marketing-Customer-Value-Analysis.csv',
    )
)
# Preview the first rows of the housing data.
df_houses.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YearSold,SaleType,SaleCondition,SalePrice
0,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,141.0,31770.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,0.0,No_Pool,No_Fence,,0.0,5,2010,WD,Normal,215000
1,One_Story_1946_and_Newer_All_Styles,Residential_High_Density,80.0,11622.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,0.0,No_Pool,Minimum_Privacy,,0.0,6,2010,WD,Normal,105000
2,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,81.0,14267.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,0.0,No_Pool,No_Fence,Gar2,12500.0,6,2010,WD,Normal,172000
3,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,93.0,11160.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Corner,...,0.0,No_Pool,No_Fence,,0.0,4,2010,WD,Normal,244000
4,Two_Story_1946_and_Newer,Residential_Low_Density,74.0,13830.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Inside,...,0.0,No_Pool,Minimum_Privacy,,0.0,3,2010,WD,Normal,189900


## Mathematical Transforms

In [7]:
# Relationships among numerical features are often expressed through
# mathematical formulas. The "stroke ratio", for instance, is a measure of how
# efficient an engine is versus how performant it is.

# Element-wise ratio of each row's stroke to its bore.
df_autos["stroke_ratio"] = df_autos["stroke"].div(df_autos["bore"])

# Show the two inputs next to the derived feature.
df_autos[["stroke", "bore", "stroke_ratio"]].head()

Unnamed: 0,stroke,bore,stroke_ratio
0,2.68,3.47,0.772334
1,2.68,3.47,0.772334
2,3.47,2.68,1.294776
3,3.4,3.19,1.065831
4,3.4,3.19,1.065831


In [15]:
# Data visualization can suggest transformations, often a "reshaping" of a
# feature through powers or logarithms.
#
# NOTE(review): `df_accidents` is not loaded by any visible cell of this
# notebook. It has to be read (with a `WindSpeed` column) before this cell,
# otherwise "Restart & Run All" fails here with a NameError — confirm where
# the accidents data is supposed to come from.

# If the feature has 0.0 values, use np.log1p (log(1+x)) instead of np.log.
# np.log1p is a NumPy ufunc, so it is applied to the whole Series in one
# vectorized call rather than element by element through `.apply`.
df_accidents["LogWindSpeed"] = np.log1p(df_accidents["WindSpeed"])

# Plot a comparison: raw distribution on the left, log-transformed on the right.
# `fill=True` is the current seaborn spelling of the deprecated `shade=True`.
fig, axs = plt.subplots(1, 2, figsize=(8, 4))
sns.kdeplot(df_accidents["WindSpeed"], fill=True, ax=axs[0])
sns.kdeplot(df_accidents["LogWindSpeed"], fill=True, ax=axs[1]);

In [21]:
# A set of related features can be aggregated by counting how many of them are
# "present" in each row — here, how many component columns are greater than 0.

components = [
    "Cement",
    "BlastFurnaceSlag",
    "FlyAsh",
    "Water",
    "Superplasticizer",
    "CoarseAggregate",
    "FineAggregate",
]
# Boolean mask of positive amounts, summed across the columns of each row.
df_concrete["Components"] = (df_concrete[components] > 0).sum(axis="columns")

df_concrete[[*components, "Components"]].head(10)

Unnamed: 0,Cement,BlastFurnaceSlag,FlyAsh,Water,Superplasticizer,CoarseAggregate,FineAggregate,Components
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,5
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,5
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,5
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,5
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,5
5,266.0,114.0,0.0,228.0,0.0,932.0,670.0,5
6,380.0,95.0,0.0,228.0,0.0,932.0,594.0,5
7,380.0,95.0,0.0,228.0,0.0,932.0,594.0,5
8,266.0,114.0,0.0,228.0,0.0,932.0,670.0,5
9,475.0,0.0,0.0,228.0,0.0,932.0,594.0,4


## Building-Up and Breaking-Down Features

In [24]:
# The Customer Lifetime Value dataset contains features describing customers
# of an insurance company.

# Break the Policy feature into two new features: split each string on " "
# through the string accessor and expand the pieces into separate columns;
# the first piece becomes Type and the second becomes Level.
df_customer[["Type", "Level"]] = df_customer["Policy"].str.split(" ", expand=True)

df_customer[["Policy", "Type", "Level"]].head(10)

Unnamed: 0,Policy,Type,Level
0,Corporate L3,Corporate,L3
1,Personal L3,Personal,L3
2,Personal L3,Personal,L3
3,Corporate L2,Corporate,L2
4,Personal L1,Personal,L1
5,Personal L3,Personal,L3
6,Corporate L3,Corporate,L3
7,Corporate L3,Corporate,L3
8,Corporate L3,Corporate,L3
9,Special L2,Special,L2


In [27]:
# Simple features can also be joined into a composed feature when there is
# reason to believe the combination carries some interaction.
# Concatenate make and body_style element-wise with "_" between them.
df_autos["make_and_style"] = df_autos["make"].str.cat(df_autos["body_style"], sep="_")
df_autos[["make", "body_style", "make_and_style"]].head()

Unnamed: 0,make,body_style,make_and_style
0,alfa-romero,convertible,alfa-romero_convertible
1,alfa-romero,convertible,alfa-romero_convertible
2,alfa-romero,hatchback,alfa-romero_hatchback
3,audi,sedan,audi_sedan
4,audi,sedan,audi_sedan


## Group Transforms

In [29]:
# Group transforms aggregate information across multiple rows grouped by some
# category. Here every row receives the mean Income of the rows that share its
# State; `transform` returns one value per original row.
df_customer["AverageIncome"] = df_customer.groupby("State")["Income"].transform("mean")

df_customer[["State", "Income", "AverageIncome"]].head(10)

Unnamed: 0,State,Income,AverageIncome
0,Washington,56274,38122.733083
1,Arizona,0,37405.402231
2,Nevada,48767,38369.605442
3,California,0,37558.946667
4,Washington,43836,38122.733083
5,Oregon,62902,37557.283353
6,Oregon,55350,37557.283353
7,Arizona,0,37405.402231
8,Oregon,14072,37557.283353
9,Oregon,28812,37557.283353


In [31]:
# Other handy methods include max, min, median, var, std, and count.
# Here the frequency with which each state occurs in the dataset is computed:
# the per-state row count, broadcast back to every row, divided by the total
# number of non-null State values.
df_customer["StateFreq"] = (
    df_customer.groupby("State")["State"]
    .transform("count")
    .div(df_customer["State"].count())
)

df_customer[["State", "StateFreq"]].head(10)

Unnamed: 0,State,StateFreq
0,Washington,0.087366
1,Arizona,0.186446
2,Nevada,0.096562
3,California,0.344865
4,Washington,0.087366
5,Oregon,0.28476
6,Oregon,0.28476
7,Arizona,0.186446
8,Oregon,0.28476
9,Oregon,0.28476


In [36]:
# A transform like StateFreq above can serve as a "frequency encoding" for a
# categorical feature. When working with training and validation splits, build
# the grouped feature from the training split only and then join it onto the
# validation split, so the validation rows do not contribute to the feature.
# The feature built here is the mean Total Claim Amount per Coverage value.

# Create splits. A fixed random_state makes the split, and therefore every
# number displayed below, reproducible across kernel restarts. The explicit
# .copy() gives df_train its own data before a column is added to it.
df_train = df_customer.sample(frac=0.5, random_state=0).copy()
df_valid = df_customer.drop(df_train.index)

# Create the average claim amount by coverage type, on the training set
df_train["AverageClaim"] = df_train.groupby("Coverage")["Total Claim Amount"].transform("mean")

# Merge the values into the validation set. drop_duplicates() reduces the
# training columns to one row per Coverage value so the left join does not
# multiply validation rows.
df_valid = df_valid.merge(
    df_train[["Coverage", "AverageClaim"]].drop_duplicates(),
    on="Coverage",
    how="left",
)

df_valid[["Coverage", "AverageClaim"]].head(10)

Unnamed: 0,Coverage,AverageClaim
0,Basic,379.417662
1,Extended,483.466613
2,Premium,651.341878
3,Basic,379.417662
4,Basic,379.417662
5,Extended,483.466613
6,Basic,379.417662
7,Premium,651.341878
8,Basic,379.417662
9,Basic,379.417662


## Create Mathematical Transforms

In [4]:
# Ratios and sums of existing housing features.
X_1 = pd.DataFrame(  # dataframe to hold new features
    {
        # Living area relative to lot area.
        "LivLotRatio": df_houses["GrLivArea"] / df_houses["LotArea"],
        # Combined first- and second-floor area per above-ground room.
        "Spaciousness": (
            (df_houses["FirstFlrSF"] + df_houses["SecondFlrSF"])
            / df_houses["TotRmsAbvGrd"]
        ),
        # Sum of the five outdoor-area columns.
        "TotalOutsideSF": (
            df_houses["WoodDeckSF"]
            + df_houses["OpenPorchSF"]
            + df_houses["EnclosedPorch"]
            + df_houses["Threeseasonporch"]
            + df_houses["ScreenPorch"]
        ),
    }
)

X_1.head()

Unnamed: 0,LivLotRatio,Spaciousness,TotalOutsideSF
0,0.052125,236.571429,272.0
1,0.077095,179.2,260.0
2,0.093152,221.5,429.0
3,0.189068,263.75,0.0
4,0.117787,271.5,246.0


In [6]:
# If you've discovered an interaction effect between a numeric feature and a
# categorical feature, you might want to model it explicitly using a one-hot
# encoding. An interaction between BldgType and GrLivArea was found in
# Exercise 2; the interaction features are created here.

# One-hot encode BldgType (columns prefixed "Bldg"), then multiply every
# indicator column row-wise by GrLivArea.
X_2 = (
    pd.get_dummies(df_houses["BldgType"], prefix="Bldg")
    .mul(df_houses["GrLivArea"], axis="index")
)

X_2.head()

Unnamed: 0,Bldg_Duplex,Bldg_OneFam,Bldg_Twnhs,Bldg_TwnhsE,Bldg_TwoFmCon
0,0.0,1656.0,0.0,0.0,0.0
1,0.0,896.0,0.0,0.0,0.0
2,0.0,1329.0,0.0,0.0,0.0
3,0.0,2110.0,0.0,0.0,0.0
4,0.0,1629.0,0.0,0.0,0.0


In [8]:
# Create a feature, PorchTypes, that describes how many kinds of outdoor areas
# a dwelling has: the count of outdoor-area features greater than 0.0.

components = [
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "Threeseasonporch",
    "ScreenPorch",
]

# Per-row number of outdoor-area columns with a positive value.
X_3 = (df_houses[components] > 0).sum(axis="columns").to_frame(name="PorchTypes")
X_3.head()

Unnamed: 0,PorchTypes
0,2
1,2
2,2
3,0
4,2


## Break Down a Categorical Feature

In [10]:
# MSSubClass describes the type of a dwelling; list its distinct values.
df_houses["MSSubClass"].unique()

array(['One_Story_1946_and_Newer_All_Styles', 'Two_Story_1946_and_Newer',
       'One_Story_PUD_1946_and_Newer',
       'One_and_Half_Story_Finished_All_Ages', 'Split_Foyer',
       'Two_Story_PUD_1946_and_Newer', 'Split_or_Multilevel',
       'One_Story_1945_and_Older', 'Duplex_All_Styles_and_Ages',
       'Two_Family_conversion_All_Styles_and_Ages',
       'One_and_Half_Story_Unfinished_All_Ages',
       'Two_Story_1945_and_Older', 'Two_and_Half_Story_All_Ages',
       'One_Story_with_Finished_Attic_All_Ages',
       'PUD_Multilevel_Split_Level_Foyer',
       'One_and_Half_Story_PUD_All_Ages'], dtype=object)

In [16]:
# Create a feature containing only the first word of each MSSubClass value by
# splitting at the first underscore (n=1 limits the split to one cut) and
# keeping the leading piece.

X_4 = (
    df_houses["MSSubClass"]
    .str.split("_", n=1)
    .str[0]
    .to_frame(name="MSClass")
)

X_4.head()

Unnamed: 0,MSClass
0,One
1,One
2,One
3,One
4,Two


## Use a Grouped Transform

In [19]:
# The value of a home often depends on how it compares to typical homes in its
# neighborhood. MedNhbdArea holds, for every row, the median GrLivArea of the
# rows that share its Neighborhood.

X_5 = (
    df_houses.groupby("Neighborhood")["GrLivArea"]
    .transform("median")
    .to_frame(name="MedNhbdArea")
)

X_5.head()

Unnamed: 0,MedNhbdArea
0,1200.0
1,1200.0
2,1200.0
3,1200.0
4,1560.0
