<a href="https://colab.research.google.com/github/Pmilivojevic/PyTorch/blob/main/feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install -U dtale
# !pip install -U six
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
import dtale
import dtale.app as dtale_app
# dtale_app.USE_NGROK = True
# Tell D-Tale it is running inside Google Colab.
dtale_app.USE_COLAB = True

# Lift pandas' display limits so wide/long frames are shown without truncation.
# Uses the public `pd.set_option` entry point rather than the incidental
# `pd.pandas` self-reference.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Training CSV location under /content/drive (presumably a mounted Google Drive).
train_csv_path = '/content/drive/MyDrive/ColabNotebooks/ExploratoryDataAnalysis/datasets/train.csv'
dataset = pd.read_csv(train_csv_path)

# NOTE(review): `df` is not referenced by any cell visible here — confirm
# whether this seaborn sample dataset is still needed.
df = sns.load_dataset('planets')

In [None]:
# Open the raw training frame in D-Tale for interactive inspection.
dtale.show(dataset)





https://og97w0ql39f-496ff2e9c6d22116-40000-colab.googleusercontent.com/dtale/main/1

In [None]:
# Object-dtype columns that contain at least one null value.
features_nan = [
    column for column in dataset.columns
    if dataset[column].dtype == 'O' and dataset[column].isnull().any()
]

# Report the fraction of null rows for each of those columns.
for feature in features_nan:
  missing_share = dataset[feature].isnull().mean()
  print(f'{feature}: {np.round(missing_share, 4)}')

In [None]:
def replace_nan_vals(dataset, nan_features, fill_value='Missing'):
  """Return a copy of ``dataset`` with nulls in ``nan_features`` replaced.

  Args:
    dataset: DataFrame to clean. It is copied first, so the caller's frame
      is left untouched.
    nan_features: Column labels whose null entries should be filled.
    fill_value: Replacement for null entries. Defaults to ``'Missing'``,
      which keeps the original behaviour for existing callers.

  Returns:
    A new DataFrame in which the selected columns have no nulls; every
    other column is carried over unchanged.
  """
  data = dataset.copy()

  data[nan_features] = data[nan_features].fillna(fill_value)

  return data

In [None]:
# Replace nulls in the categorical columns found above with a placeholder label.
dataset = replace_nan_vals(dataset, features_nan)
# Sanity check: every listed column should now report zero nulls.
dataset[features_nan].isnull().sum()

In [None]:
# Non-object (numeric) columns that contain at least one null value.
features_num_nan = [
    name for name, dtype in dataset.dtypes.items()
    if dtype != 'O' and dataset[name].isnull().any()
]

# Report the fraction of null rows for each of those columns.
for feature in features_num_nan:
  missing_share = np.round(dataset[feature].isnull().mean(), 4)
  print(f'{feature} {missing_share}: missing values')

In [None]:
# Impute numeric gaps with the column median and keep a 0/1 indicator column
# recording which rows were originally missing.
for feature in features_num_nan:
  median_val = dataset[feature].median()

  # Build the indicator before filling, while the nulls are still present.
  dataset[feature + '_nan'] = np.where(dataset[feature].isnull(), 1, 0)
  # Fill only the null entries. Assigning the scalar directly would overwrite
  # every row with the median and turn the column into a constant.
  dataset[feature] = dataset[feature].fillna(median_val)

# Sanity check: every imputed column should now report zero nulls.
dataset[features_num_nan].isnull().sum()

In [None]:
# Year-like columns, excluding the sale year itself and the '_nan' indicator
# columns added during imputation.
temp_features = [
    name for name in dataset.columns
    if ('Yr' in name or 'Year' in name)
    and not ('Sold' in name or 'nan' in name)
]

# Re-express each year column as the number of years between that event and
# the sale. 'YrSold' is never in temp_features, so it can be read once up front.
year_sold = dataset['YrSold']
for feature in temp_features:
  dataset[feature] = year_sold - dataset[feature]

dataset[temp_features].head()

In [None]:
# Columns that get a natural-log transform, including the target 'SalePrice'.
num_features = ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea', 'SalePrice']

# np.log is applied element-wise, so all listed columns are transformed in one step.
dataset[num_features] = np.log(dataset[num_features])

In [None]:
# Names of all columns that currently have object dtype.
categorical_features = [
    name for name, dtype in dataset.dtypes.items()
    if dtype == 'O'
]

In [None]:
# Collapse infrequent categories: any label whose share of rows is not above
# 1% is replaced by the single label 'Rare_var'.
total_rows = len(dataset)
for feature in categorical_features:
  label_share = dataset.groupby(feature)['SalePrice'].count() / total_rows
  frequent_labels = label_share[label_share > 0.01].index
  is_frequent = dataset[feature].isin(frequent_labels)
  dataset[feature] = np.where(is_frequent, dataset[feature], 'Rare_var')

In [None]:
# Display the last categorical column to eyeball the result of the rare-label step.
dataset[categorical_features[-1]]

In [None]:
# Encode each categorical column as integers ranked by the mean 'SalePrice' of
# its labels: the label with the lowest mean maps to 0, the next to 1, and so on.
for feature in categorical_features:
  mean_price_by_label = dataset.groupby(feature)['SalePrice'].mean()
  ranked_labels = mean_price_by_label.sort_values().index
  label_to_rank = dict(zip(ranked_labels, range(len(ranked_labels))))

  dataset[feature] = dataset[feature].map(label_to_rank)

In [None]:
# Everything except the row identifier and the target gets scaled.
not_scaled = ('Id', 'SalePrice')
features_scale = [name for name in dataset.columns if name not in not_scaled]

# Fit the min-max scaler on those columns; the fitted scaler is the cell output.
scaler = MinMaxScaler()
scaler.fit(dataset[features_scale])

In [None]:
# Preview the scaled feature matrix; the result is displayed only, not stored.
scaler.transform(dataset[features_scale])

In [None]:
# Identifier and target are kept unscaled; the index is reset so the rows line
# up positionally with the freshly built scaled frame.
id_and_target = dataset[['Id', 'SalePrice']].reset_index(drop=True)
scaled_features = pd.DataFrame(
    scaler.transform(dataset[features_scale]),
    columns=features_scale,
)

# Place both parts side by side in a single frame.
data = pd.concat([id_and_target, scaled_features], axis=1)

data.head()

In [None]:
# Target is kept as a one-column DataFrame; features are everything except the
# identifier and the target.
target_columns = ['SalePrice']
Y_train = data[target_columns]
X_train = data.drop(columns=['Id', 'SalePrice'])

In [None]:
# Lasso estimator with a fixed seed, wrapped in SelectFromModel so the fitted
# model can report which features it keeps.
lasso_estimator = Lasso(alpha=0.005, random_state=0)
feature_select_model = SelectFromModel(lasso_estimator)
feature_select_model.fit(X_train, Y_train)

In [None]:
# Boolean mask over X_train's columns marking the features the selector kept.
support_mask = feature_select_model.get_support()
selected_features = X_train.columns[support_mask]

In [None]:
# Show the first rows of the training features restricted to the selected columns.
X_train[selected_features].head()

Unnamed: 0,MSSubClass,MSZoning,Neighborhood,OverallQual,YearRemodAdd,RoofStyle,BsmtQual,BsmtExposure,HeatingQC,CentralAir,1stFlrSF,GrLivArea,BsmtFullBath,KitchenQual,Fireplaces,FireplaceQu,GarageType,GarageFinish,GarageCars,PavedDrive,SaleCondition
0,0.235294,0.75,0.636364,0.666667,0.098361,0.0,0.75,0.25,1.0,1.0,0.356155,0.577712,0.333333,0.666667,0.0,0.2,0.8,0.666667,0.5,1.0,0.75
1,0.0,0.75,0.5,0.555556,0.52459,0.0,0.75,1.0,1.0,1.0,0.503056,0.470245,0.0,0.333333,0.333333,0.6,0.8,0.666667,0.5,1.0,0.75
2,0.235294,0.75,0.636364,0.666667,0.114754,0.0,0.75,0.5,1.0,1.0,0.383441,0.593095,0.333333,0.666667,0.333333,0.6,0.8,0.666667,0.5,1.0,0.75
3,0.294118,0.75,0.727273,0.666667,0.606557,0.0,0.5,0.25,0.75,1.0,0.399941,0.579157,0.333333,0.666667,0.333333,0.8,0.4,0.333333,0.75,1.0,0.0
4,0.235294,0.75,1.0,0.777778,0.147541,0.0,0.75,0.75,1.0,1.0,0.466237,0.666523,0.333333,0.666667,0.333333,0.6,0.8,0.666667,0.75,1.0,0.75
