In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Lasso,Ridge
from sklearn.preprocessing import PolynomialFeatures
# Install a blanket "ignore" filter so warnings are not shown in the cell outputs.
warnings.filterwarnings("ignore")
# Seed NumPy's global random number generator.
np.random.seed(42)

In [2]:
# Load the semicolon-delimited white-wine quality table and separate the target column.
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv", sep = ';')
y = df.pop('quality')

# Split before imputing so held-out rows cannot influence the fill values.
# An explicit random_state pins the split even when this cell is re-run on its own.
train, test, y_train, y_test = train_test_split(df, y, test_size = 0.2, random_state = 42)

# Fill missing values in both splits with column means learned from the training rows only.
train_means = train.mean()
train = train.fillna(train_means)
test = test.fillna(train_means)

In [3]:
def fit_predict(train, test, y_train, y_test, scaler = None):
    """Fit a default Ridge model on the training data and print its test MAE.

    If ``scaler`` is given, it is fitted on ``train`` and the fitted transform
    is applied to ``test`` before modelling; otherwise both inputs are used as
    passed. The score is printed, and nothing is returned.
    """
    if scaler is not None:
        # Fit the scaler on the training data only, then reuse it for the test data.
        features_fit = scaler.fit_transform(train)
        features_eval = scaler.transform(test)
    else:
        features_fit, features_eval = train, test

    model = Ridge()
    model.fit(features_fit, y_train)
    mae = mean_absolute_error(y_test, model.predict(features_eval))
    print('MAE score:', mae)

# Baseline run: no scaler is passed, so the model sees the features as-is.
fit_predict(train,test,y_train,y_test)

MAE score: 0.59404938801


In [4]:
def get_feat_imp(train,y_train,alpha=0.01):
    """Return the coefficient array of a Lasso model fitted on the training data.

    ``alpha`` is forwarded unchanged to the ``Lasso`` constructor.
    """
    fitted = Lasso(alpha=alpha).fit(train, y_train)
    return fitted.coef_
# Lasso coefficients for the training split, then a count of the non-zero ones.
fi = get_feat_imp(train, y_train)
print('non zero features:', np.count_nonzero(fi))

non zero features: 6


In [5]:
# Positions of the features whose coefficient in `fi` is non-zero (one index per row).
bestf = np.argwhere(fi)
best_positions = bestf[:, 0].tolist()

# Keep only those columns, by position, in both splits.
train_best = train.iloc[:, best_positions]
test_best = test.iloc[:, best_positions]

In [6]:
def create_poly(train,test,degree):
    """Expand the train and test features into polynomial terms up to ``degree``.

    The ``PolynomialFeatures`` transformer is fitted on ``train`` only and then
    applied unchanged to ``test``, so both outputs come from the same fitted
    transformer.

    Returns a ``(train_poly, test_poly)`` tuple.
    """
    poly = PolynomialFeatures(degree=degree)
    train_poly = poly.fit_transform(train)
    # transform, not fit_transform: held-out data must never refit the transformer.
    test_poly = poly.transform(test)
    return train_poly,test_poly

In [7]:
# Polynomial degrees 1 to 3 on the full feature set; each pass prints a header, the MAE and a rule.
for degree in (1, 2, 3):
    train_poly, test_poly = create_poly(train, test, degree)
    print('No feature selection degree', degree)
    fit_predict(train_poly, test_poly, y_train, y_test)
    print('-' * 10)

No feature selection degree 1
MAE score: 0.59404938801
----------
No feature selection degree 2
MAE score: 0.577238983011
----------
No feature selection degree 3
MAE score: 0.596958634563
----------


In [8]:
# Polynomial degrees 1 to 3 on the selected feature columns; each pass prints a header, the MAE and a rule.
for degree in (1, 2, 3):
    train_poly, test_poly = create_poly(train_best, test_best, degree)
    print('Feature selection degree', degree)
    fit_predict(train_poly, test_poly, y_train, y_test)
    print('-' * 10)

Feature selection degree 1
MAE score: 0.597972321004
----------
Feature selection degree 2
MAE score: 0.591541808012
----------
Feature selection degree 3
MAE score: 0.597630769778
----------
