In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Lasso,Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings("ignore")
np.random.seed(42)

In [2]:
# Load the white-wine quality table (semicolon-separated) and detach the target column.
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv", sep = ';')
y = df.pop('quality')

# Split BEFORE imputing, so that no statistic computed from held-out rows can
# leak into the training data. The explicit random_state keeps the split
# stable when this cell is re-run without restarting the kernel.
train, test, y_train, y_test = train_test_split(df, y, test_size = 0.2, random_state = 42)

# Fill missing values in both partitions with the per-column means of the
# training partition only. fillna returns new frames, so `train` and `test`
# are independent objects rather than slices of `df`.
train_means = train.mean()
train = train.fillna(train_means)
test = test.fillna(train_means)

In [3]:
def fit_predict(train, test, y_train, y_test, scaler = None):
    """Fit a default Ridge regressor and print its MAE on the evaluation set.

    Parameters
    ----------
    train, test
        Feature matrices used for fitting and for evaluation.
    y_train, y_test
        Targets matching ``train`` and ``test``.
    scaler
        Optional transformer exposing ``fit_transform`` / ``transform``.
        When supplied it is fitted on ``train`` only and then applied to
        ``test``; when ``None`` the features are used as given.

    Prints ``MAE score: <value>``. Nothing is returned.
    """
    features_fit, features_eval = train, test
    if scaler is not None:
        # Learn the scaling from the training data, reuse it for evaluation.
        features_fit = scaler.fit_transform(train)
        features_eval = scaler.transform(test)

    model = Ridge().fit(features_fit, y_train)
    error = mean_absolute_error(y_test, model.predict(features_eval))
    print('MAE score:', error)

# Baseline: Ridge on the features as split above, with no scaler passed.
fit_predict(train,test,y_train,y_test)

MAE score: 0.59404938801


In [4]:
def feat_eng(df):
    """Return a copy of ``df`` with four engineered columns appended.

    Added columns:
      * ``eng1`` = ``fixed acidity`` * ``pH``
      * ``eng2`` = ``total sulfur dioxide`` / ``free sulfur dioxide``
      * ``eng3`` = ``sulphates`` / ``chlorides``
      * ``eng4`` = ``chlorides`` / ``sulphates``

    The input frame is NOT modified. Working on a copy avoids writing new
    columns into what may be a slice of another DataFrame (which pandas can
    flag with SettingWithCopyWarning) and makes the function safe to call
    repeatedly on the same input.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the six source columns named above.

    Returns
    -------
    pandas.DataFrame
        New frame holding the original columns plus ``eng1``..``eng4``.

    Raises
    ------
    KeyError
        If any of the required source columns is missing.
    """
    out = df.copy()
    out['eng1'] = out['fixed acidity'] * out['pH']
    # The ratios below are plain element-wise divisions; a zero denominator
    # yields inf/NaN rather than raising.
    out['eng2'] = out['total sulfur dioxide'] / out['free sulfur dioxide']
    out['eng3'] = out['sulphates'] / out['chlorides']
    out['eng4'] = out['chlorides'] / out['sulphates']
    return out

# Add the engineered eng1..eng4 columns to both partitions.
train = feat_eng(train)
test = feat_eng(test)

In [5]:
def get_feat_imp(train,y_train,alpha=0.01):
    """Fit a Lasso on ``(train, y_train)`` and return its coefficient vector.

    Parameters
    ----------
    train
        Feature matrix.
    y_train
        Target values.
    alpha : float, default 0.01
        Passed straight through to ``Lasso(alpha=...)``.

    Returns
    -------
    The fitted model's ``coef_`` attribute (one entry per input column).
    """
    lasso_model = Lasso(alpha=alpha)
    return lasso_model.fit(train, y_train).coef_
# Lasso coefficients on the training partition; count how many are non-zero.
fi = get_feat_imp(train,y_train)
print('non zero features:',np.sum(fi != 0))

non zero features: 8


In [6]:
# Positions of the columns whose Lasso coefficient is non-zero.
# np.argwhere on the coefficient vector gives one row per hit, so the first
# (only) column of the result holds the positional indices.
bestf = np.argwhere(fi)
train_best = train.iloc[:, bestf[:, 0].tolist()]
test_best = test.iloc[:, bestf[:, 0].tolist()]

In [7]:
def create_poly(train,test,degree):
    """Expand ``train`` and ``test`` into polynomial features of ``degree``.

    The ``PolynomialFeatures`` transformer is fitted on ``train`` only and
    then applied to ``test`` with ``transform``. The held-out data must never
    be used to fit a preprocessing step, and fitting once guarantees both
    outputs share the same column layout.

    Parameters
    ----------
    train, test
        Feature matrices with the same number of columns.
    degree : int
        Passed to ``PolynomialFeatures(degree=...)``.

    Returns
    -------
    (train_poly, test_poly)
        The expanded training and test matrices, in that order.
    """
    poly = PolynomialFeatures(degree=degree)
    train_poly = poly.fit_transform(train)
    # transform, not fit_transform: reuse what was learned from `train`.
    test_poly = poly.transform(test)
    return train_poly,test_poly

In [8]:
# Ridge on polynomial expansions of every column in train/test, with a
# fresh StandardScaler handed to fit_predict for each degree.
for degree in (1, 2, 3, 4):
    train_poly, test_poly = create_poly(train, test, degree)
    print('No feature selection degree', degree)
    fit_predict(train_poly, test_poly, y_train, y_test, scaler=StandardScaler())
    print('-' * 10)

No feature selection degree 1
MAE score: 0.579732117236
----------
No feature selection degree 2
MAE score: 0.566008983832
----------
No feature selection degree 3
MAE score: 0.557362504716
----------
No feature selection degree 4
MAE score: 0.569031127402
----------


In [9]:
# Same sweep as above, restricted to the columns kept in train_best/test_best.
for degree in (1, 2, 3, 4):
    train_poly, test_poly = create_poly(train_best, test_best, degree)
    print('Feature selection degree', degree)
    fit_predict(train_poly, test_poly, y_train, y_test, scaler=StandardScaler())
    print('-' * 10)

Feature selection degree 1
MAE score: 0.586503532959
----------
Feature selection degree 2
MAE score: 0.575863518226
----------
Feature selection degree 3
MAE score: 0.571050244983
----------
Feature selection degree 4
MAE score: 0.626462712407
----------


In [10]:
# Relative MAE reduction, in percent, between two hand-entered scores.
# NOTE(review): both literals are entered manually; they must be updated by
# hand whenever the cells that print the MAE scores are re-run.
original_score = 0.59404938801
best_score = 0.557362504716
improvement = np.round(
    100 * (original_score - best_score) / original_score,
    decimals=2,
)
print('overall improvement is {} %'.format(improvement))

overall improvement is 6.18 %
