# Tutorials on Feature Engineering

## 1. Evaluate a model using Cross-Validation

In [1]:
# Import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [None]:
# Load the raw table, then split it into the feature matrix and the target
df = pd.read_csv("file.name")
y = df["column.name"].copy()
X = df.drop(columns=["column.name"])

In [None]:
# Evaluate the model with 5-fold cross-validation, scored by mean absolute error.
# "absolute_error" is the current name of the MAE split criterion; the old
# "mae" alias was deprecated in scikit-learn 1.0 and removed in 1.2.
model = RandomForestRegressor(criterion="absolute_error", random_state=0)
fold_scores = cross_val_score(model, X, y, cv=5, scoring="neg_mean_absolute_error")
# The scorer returns negated MAE (higher is better), so flip the sign of the
# mean over the 5 folds to report an ordinary, positive MAE.
scores = -1 * fold_scores.mean()

## 2. Apply Mutual Information to measure a relationship between two quantities

In [None]:
# Import libraries
import seaborn as sns
from sklearn.feature_selection import mutual_info_regression


The scikit-learn algorithm for MI treats discrete features differently from continuous features.
Consequently, you need to tell it which are which. As a rule of thumb, anything that must have a float
dtype is not discrete. Categoricals (object or categorical dtype) can be treated as discrete by giving them a label encoding.


In [None]:
# Function calculates MI scores
def make_mi_scores(X, y):
    """Compute mutual-information scores between each feature and the target.

    Works on a copy of ``X``, so the caller's frame is left untouched.
    Columns of ``object`` or ``category`` dtype are label-encoded with
    ``factorize``; afterwards every integer-dtype column is flagged as a
    discrete feature for ``mutual_info_regression``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table.
    y : array-like
        Target values, passed straight through to ``mutual_info_regression``.

    Returns
    -------
    pandas.Series
        Scores named ``"MI Scores"``, indexed by column name and sorted from
        highest to lowest.
    """
    encoded = X.copy()
    categorical_cols = encoded.select_dtypes(["object", "category"]).columns
    for col in categorical_cols:
        codes, _uniques = encoded[col].factorize()
        encoded[col] = codes
    # After label encoding, "integer dtype" is the marker for a discrete feature
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in encoded.dtypes]
    raw_scores = mutual_info_regression(
        encoded, y, discrete_features=is_discrete, random_state=0
    )
    ranked = pd.Series(raw_scores, name="MI Scores", index=encoded.columns)
    return ranked.sort_values(ascending=False)

In [None]:
# Function for plotting MI scores
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of mutual-information scores.

    The scores are sorted in ascending order before plotting, so the bar
    positions run from the smallest score (position 0) to the largest.
    Drawing goes through the pyplot interface, i.e. onto the current
    figure/axes; nothing is returned.

    Parameters
    ----------
    scores : pandas.Series
        One score per feature; the index supplies the y-axis tick labels.

    Notes
    -----
    Relies on the notebook-level imports ``numpy as np`` and
    ``matplotlib.pyplot as plt``.
    """
    scores = scores.sort_values(ascending=True)
    positions = np.arange(len(scores))  # y-coordinate of each bar
    labels = list(scores.index)
    plt.barh(positions, scores)
    plt.yticks(positions, labels)
    plt.title("Mutual Information Scores")