<span style=font-size:25px;>Feature Engineering</span></br>
</br>
<span style=font-size:20px;>
The following steps are carried out here:</br>
1. Missing Values</br>
2. Temporal Variables</br>
3. Categorical Variables: remove rare labels</br>
4. Standardize the values of the variables to the same range</br>
</span>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Show every column when a wide DataFrame is displayed.
# Use the documented top-level pd.set_option; `pd.pandas` is only an
# incidental self-reference of the package, not a public API.
pd.set_option('display.max_columns', None)

# Load the raw training data and preview its first rows.
dataset = pd.read_csv('train.csv')
dataset.head()

### To prevent data leakage, we split the data first and then perform feature engineering

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows with a fixed seed. The full frame (SalePrice
# included) is passed as the feature table and SalePrice as the target.
# NOTE(review): the cells that follow transform `dataset`, not
# X_train / X_test, so this split is not what gets engineered below.
X_train, X_test, y_train, y_test = train_test_split(
    dataset,
    dataset['SalePrice'],
    test_size=0.1,
    random_state=42,
)

NameError: name 'dataset' is not defined

In [None]:
# Show the (rows, columns) of the train and test partitions created by the split.
X_train.shape, X_test.shape

<span style=font-size:25px;>Missing Values </span></br></br>
<span style=font-size:20px;>First, we will handle the missing values in the categorical features of the dataset.</span></br>

In [None]:
# Categorical (object-dtype) columns that contain at least one missing value.
# The threshold is "> 0": a column with a single missing entry still needs
# handling, and "> 1" would silently skip it.
features_nan = [
    feature
    for feature in dataset.columns
    if dataset[feature].isnull().sum() > 0 and dataset[feature].dtype == 'O'
]

# Report the share of missing rows per column as a real percentage
# (the mean of the null mask is a fraction, so scale it by 100).
for feature in features_nan:
    print("{}: {}% missing values".format(feature, np.round(dataset[feature].isnull().mean() * 100, 2)))

In [None]:
## Replace missing value with a new label

def replace_cat_feature(dataset,features_nan):
    """Return a copy of ``dataset`` with gaps in the given columns labelled.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input frame; it is copied and left unmodified.
    features_nan : list
        Column names whose missing entries are replaced.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every missing value of the ``features_nan``
        columns is the string ``'Missing'``; other columns are unchanged.
    """
    result = dataset.copy()
    filled_columns = result[features_nan].fillna('Missing')
    result[features_nan] = filled_columns
    return result

# Swap in the frame whose categorical gaps carry the 'Missing' label.
dataset = replace_cat_feature(dataset, features_nan)

# Per-column count of remaining nulls in the treated columns.
dataset[features_nan].isna().sum()

<span style=font-size:20px;>Missing numerical values in the dataset.</span></br>

In [None]:
# Non-object (numerical) columns that contain at least one missing value.
# The threshold is "> 0" so a column with a single missing entry is not skipped.
numerical_with_nan = [
    feature
    for feature in dataset.columns
    if dataset[feature].isnull().sum() > 0 and dataset[feature].dtypes != 'O'
]

# Print each column with its share of missing rows as a real percentage
# (the mean of the null mask is a fraction, so scale it by 100).
for feature in numerical_with_nan:
    print("{}: {}% missing value".format(feature, np.round(dataset[feature].isnull().mean() * 100, 2)))

In [None]:
# Impute missing numerical values with the column median, which is less
# sensitive to outliers than the mean.
for feature in numerical_with_nan:
    median_value = dataset[feature].median()

    # Keep a 0/1 indicator of where the value was missing, in a companion
    # column named '<feature>nan'. The column key must be built by string
    # concatenation: `feature == 'nan'` is a comparison that evaluates to
    # False, which made every indicator overwrite one column named False.
    dataset[feature + 'nan'] = np.where(dataset[feature].isnull(), 1, 0)

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection, which is chained assignment and is not
    # guaranteed to update the parent frame.
    dataset[feature] = dataset[feature].fillna(median_value)

# Sanity check: each imputed column should now report zero missing values.
dataset[numerical_with_nan].isnull().sum()

In [None]:
# Inspect the first 50 rows after the missing-value handling.
dataset.head(50)

## Temporal Variable (Date-Time Variable)

In [None]:
# Express each year column as "years before the sale": YrSold minus the
# recorded year. The columns are overwritten in place, so running this cell
# a second time would subtract again.
year_columns = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']
for year_column in year_columns:
    years_before_sale = dataset['YrSold'] - dataset[year_column]
    dataset[year_column] = years_before_sale

In [None]:
# Preview the frame after the year columns were replaced by their difference from YrSold.
dataset.head()

In [None]:
# Look at the three converted year columns on their own.
dataset[['YearBuilt','YearRemodAdd','GarageYrBlt']].head()

# Numerical Variables
<span style=font-size:20px;>We will apply a log transformation to selected numerical variables so that their distributions are closer to normal.</span></br></br>

In [None]:
# Preview the data before the log transform in the next cell.
dataset.head()

In [None]:
# `np` is already imported in the notebook's first code cell, so it is not
# re-imported here.
num_features = ['LotFrontage','LotArea','1stFlrSF','GrLivArea','SalePrice']

# Replace each listed column by its natural logarithm.
# NOTE(review): assumes every value in these columns is strictly positive
# (np.log gives -inf for 0 and NaN for negatives) -- TODO confirm.
# The columns are overwritten in place, so re-running this cell would apply
# the logarithm a second time.
for feature in num_features:
    dataset[feature] = np.log(dataset[feature])

In [None]:
# Preview the data after the log transform.
dataset.head()


<span style=font-size:25px;>Handling Rare Categorical Feature</span></br></br>
<span style=font-size:18px;> For each categorical variable, we will group the labels that appear in 1% or less of the observations under a single <code>Rare_var</code> label.</span></br>

In [None]:
# Names of all object-dtype columns, treated here as the categorical features.
categorical_features = [
    column_name
    for column_name, column_dtype in dataset.dtypes.items()
    if column_dtype == 'O'
]

In [None]:
# Display the column names collected in categorical_features.
categorical_features

In [None]:
# A label is kept only when its share of all rows is above this threshold;
# every other value in the column becomes 'Rare_var'.
rare_threshold = 0.01
total_rows = len(dataset)

for column in categorical_features:
    # Share of rows per label, counted through the SalePrice column.
    label_share = dataset.groupby(column)['SalePrice'].count() / total_rows
    frequent_labels = label_share[label_share > rare_threshold].index
    is_frequent = dataset[column].isin(frequent_labels)
    dataset[column] = np.where(is_frequent, dataset[column], 'Rare_var')

In [None]:
# Inspect the first 100 rows after infrequent labels were replaced with 'Rare_var'.
dataset.head(100)

In [None]:
# Feature Scaling

In [None]:
# Select the numeric columns to scale. 'Id' (an identifier) and 'SalePrice'
# (the target) are left out: the later concat step adds both back unscaled,
# so scaling them here as well would put duplicate 'Id' / 'SalePrice'
# columns into the exported frame.
numeric_features = [
    feature
    for feature in X_train.select_dtypes(include=['float64', 'int64']).columns
    if feature not in ['Id', 'SalePrice']
]

from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the selected columns so that transform() can rescale them.
scaler = MinMaxScaler()
scaler.fit(dataset[numeric_features])

In [None]:
# Display the scaled values; the result is not assigned to anything here.
scaler.transform(dataset[numeric_features])

In [None]:
# Build the export table: the Id and SalePrice columns taken straight from
# `dataset` (not passed through the scaler here), placed next to the
# scaler's output for the numeric feature columns. The index is reset so
# both parts line up row by row in the concat.
id_and_target = dataset[['Id', 'SalePrice']].reset_index(drop=True)
scaled_features = pd.DataFrame(
    scaler.transform(dataset[numeric_features]),
    columns=numeric_features,
)
data = pd.concat([id_and_target, scaled_features], axis=1)

In [None]:
# Write the assembled frame to disk without the row index.
# NOTE(review): the file is named X_train.csv, but `data` is built from the
# full `dataset`, not from the X_train split -- confirm this is intended.
data.to_csv('X_train.csv',index=False)