In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
df = pd.read_csv('winequality-white.csv', sep = ';')

In [5]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


---

### 1. Separate the target feature[‘quality], split data in 7:3 proportion (30% form a holdout set, use random_state=17), and preprocess data with StandardScaler

In [6]:
X = df.drop('quality', axis = 1)
y = df['quality']

In [7]:
from sklearn.model_selection import train_test_split

#### Splitting the Data with 30% as Holdout Set

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 17)

#### Scaling the Data

In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

#### Creating variables for Scaled X_train and X_test

In [11]:
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.fit_transform(X_test)

---

### 2. Train a simple linear regression model using sci-kit learn.

In [12]:
from sklearn.linear_model import LinearRegression

#### Initializing LinearRegression() object and fitting it

In [13]:
linreg = LinearRegression()
linreg.fit(X_train_scale, y_train)

LinearRegression()

---

### 3. What are mean squared errors of model predictions on train and holdout sets?

In [15]:
from sklearn.metrics import mean_squared_error

In [17]:
y_train_pred = linreg.predict(X_train_scale)
y_test_pred = linreg.predict(X_test_scale)

#### Mean Squared Error of Model Predictions on Train Set

In [90]:
round(mean_squared_error(y_train, y_train_pred), 2)

0.56

#### Mean Squared Error of Model Predictions on Holdout Set

In [91]:
round(mean_squared_error(y_test, y_test_pred), 2)

0.58

---

### 4. Create a data frame to display coefficients of each features.

In [120]:
coef_df = pd.DataFrame({'Features': X.columns, 'Coeff.': linreg.coef_})
coef_df['Coeff.'] = coef_df['Coeff.'].round(3)

In [121]:
coef_df

Unnamed: 0,Features,Coeff.
0,fixed acidity,0.098
1,volatile acidity,-0.192
2,citric acid,-0.0
3,residual sugar,0.538
4,chlorides,0.008
5,free sulfur dioxide,0.042
6,total sulfur dioxide,0.014
7,density,-0.666
8,pH,0.15
9,sulphates,0.062


---

### 5. Which feature this linear regression model treats as the most influential on wine quality?

In [122]:
coef_df.sort_values(by = 'Coeff.', ascending = False)

Unnamed: 0,Features,Coeff.
3,residual sugar,0.538
8,pH,0.15
10,alcohol,0.13
0,fixed acidity,0.098
9,sulphates,0.062
5,free sulfur dioxide,0.042
6,total sulfur dioxide,0.014
4,chlorides,0.008
2,citric acid,-0.0
1,volatile acidity,-0.192


#### Thus, 'residual sugar' influences most POSITIVELY with coefficent of 0.538 while 'density' affects quality influences most NEGATIVELY with coefficient -0.666

---

### 6. Train a LASSO model with α=0.01 and scaled data. Again, set random_state=17.

In [28]:
from sklearn.linear_model import Lasso

#### Initializing Lasso() object and fitting it, random_state 17 and max_iter = 250)

In [30]:
lasso = Lasso(alpha = 0.01, max_iter = 250, random_state = 17)

In [31]:
lasso.fit(X_train_scale, y_train)

Lasso(alpha=0.01, max_iter=250, random_state=17)

---

### 7. Which feature is the least informative in predicting wine quality, according to this LASSO model?

#### Building a dataframe with Features and their Coefficient

In [124]:
coef_lasso_df = pd.DataFrame({'Features': X.columns, 'Coeff.': lasso.coef_})
coef_lasso_df['Coeff.'] = coef_lasso_df['Coeff.'].round(3)

In [125]:
coef_lasso_df.sort_values(by = 'Coeff.', ascending = False)

Unnamed: 0,Features,Coeff.
10,alcohol,0.322
3,residual sugar,0.256
8,pH,0.067
5,free sulfur dioxide,0.043
9,sulphates,0.03
0,fixed acidity,-0.0
2,citric acid,-0.0
6,total sulfur dioxide,-0.0
4,chlorides,-0.003
1,volatile acidity,-0.188


#### Thus, 'fixed acidity', 'citric acid', 'total sulfur dioxide' are the least informative according to Lasso Model with coefficient of 0.00

---

### 8. Train LassoCV with random_state=17 to choose the best value of α- alpha in 5-fold cross-validation.(use LassoCV instead of Gridsearch). The list of alphas to be passed are [0.01,0.001,0.1,0.2,0.02,0.002].

In [35]:
from sklearn.linear_model import LassoCV

In [36]:
best_lasso = LassoCV(alphas = [0.01,0.001,0.1,0.2,0.02,0.002], cv = 5, random_state = 17)

#### Initializing LassoCV() object and fitting it, with alpha = [0.01,0.001,0.1,0.2,0.02,0.002], cv = 5 and random_state 17

In [38]:
best_lasso.fit(X_train_scale, y_train)

LassoCV(alphas=[0.01, 0.001, 0.1, 0.2, 0.02, 0.002], cv=5, random_state=17)

In [41]:
best_lasso.alpha_

0.001

#### Thus, from LassoCV we get  best alpha value of 0.001

---

### 9. Which feature is the least informative in predicting wine quality, according to the tuned LASSO model?

In [127]:
coef_best_lasso_df = pd.DataFrame({'Features': X.columns, 'Coeff.': best_lasso.coef_})
coef_best_lasso_df['Coeff.'] = coef_best_lasso_df['Coeff.'].round(3)

In [129]:
coef_best_lasso_df.sort_values(by = 'Coeff.', ascending = False)

Unnamed: 0,Features,Coeff.
3,residual sugar,0.498
10,alcohol,0.156
8,pH,0.138
0,fixed acidity,0.082
9,sulphates,0.058
5,free sulfur dioxide,0.044
6,total sulfur dioxide,0.01
4,chlorides,0.004
2,citric acid,0.0
1,volatile acidity,-0.192


#### Thus, 'citric acid' is least informative according to the Tuned Lasso Model with a coefficient of 0.00

---

### 10. What are mean squared errors of tuned LASSO predictions on train and holdout sets?

In [47]:
# Building Lassso with best params

In [48]:
tuned_lasso = Lasso(alpha = 0.001, max_iter = 250, random_state = 17)

#### Initializing tuned Lasso() object and fitting it, with alpha = 0.001 and random_state 17

In [50]:
tuned_lasso.fit(X_train_scale, y_train)

Lasso(alpha=0.001, max_iter=250, random_state=17)

In [53]:
y_train_pred_tuned_lasso = tuned_lasso.predict( X_train_scale)
y_test_pred_tuned_lasso = tuned_lasso.predict( X_test_scale)

#### Mean Squared Error of Model Predictions on Train Set

In [95]:
round(mean_squared_error(y_train, y_train_pred_tuned_lasso),2)

0.56

#### Mean Squared Error of Model Predictions on Holdout Set

In [96]:
round(mean_squared_error(y_test, y_test_pred_tuned_lasso),2)

0.58

---

### 11. Train a Random Forest, setting only random_state to be 17

In [56]:
from sklearn.ensemble import RandomForestRegressor

#### Initializing RandomForestRegressor() object and fitting it, random_state 17

In [57]:
random_reg = RandomForestRegressor(random_state = 17)

In [58]:
random_reg.fit(X_train_scale, y_train)

RandomForestRegressor(random_state=17)

---

### 12. What are mean squared errors of tuned randomforest predictions on train and holdout sets?

In [59]:
y_train_pred_random_reg = random_reg.predict(X_train_scale)
y_test_pred_random_reg = random_reg.predict(X_test_scale)

#### Mean Squared Error of Model Predictions on Train Set

In [132]:
round(mean_squared_error(y_train, y_train_pred_random_reg), 2)

0.05

#### Mean Squared Error of Model Predictions on Holdout Set

In [99]:
round(mean_squared_error(y_test, y_test_pred_random_reg),2)

0.41

---

### 13. Tune the max_features and max_depth hyperparameters with GridSearchCV and again check mean cross-validation MSE and MSE on holdout set. Parameters to tune 

### forest_params = {'max_depth': list(range(10, 25)),'max_features': list(range(6,12))}


In [62]:
from sklearn.model_selection import GridSearchCV

In [63]:
forest_params = [{'max_depth': list(range(10, 25)), 'max_features': list(range(6,12))}]

#### Initializing GridSearchCV() object and fitting it with forest_params, and cv = 10

In [64]:
rf_gcv = GridSearchCV(RandomForestRegressor(random_state = 17), forest_params, cv=10)

In [66]:
rf_gcv.fit(X_train_scale, y_train)

GridSearchCV(cv=10, estimator=RandomForestRegressor(random_state=17),
             param_grid=[{'max_depth': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                        20, 21, 22, 23, 24],
                          'max_features': [6, 7, 8, 9, 10, 11]}])

In [68]:
rf_gcv.best_params_

{'max_depth': 22, 'max_features': 6}

#### Thus, we got best params as max_depth = 22 and max_features = 6

#### Building again RandomForestRegressor Model with Best Params from Grid Search CV

In [70]:
rf_tuned = RandomForestRegressor(max_depth = 22, max_features = 6, random_state = 17)

In [71]:
rf_tuned.fit(X_train_scale, y_train)

RandomForestRegressor(max_depth=22, max_features=6, random_state=17)

In [72]:
y_test_pred_rf_tuned = rf_tuned.predict(X_test_scale)

#### Finding the MSE of Holdout Set

In [106]:
round(mean_squared_error(y_test, y_test_pred_rf_tuned), 3)

0.403

In [133]:
from sklearn.model_selection import cross_val_score

#### Cross Validation Score for each run of Cross Validation

In [134]:
cross_val_score(RandomForestRegressor(max_depth = 22, max_features = 6, random_state = 17), X_train_scale, y_train)

array([0.47249199, 0.51006404, 0.47004673, 0.49422968, 0.49448858])

---

### 14. Output RF's feature importance. Again, it's nice to present it as a DataFrame. What is the most important feature, according to the Random Forest model?

In [130]:
coef_rf_tuned_df = pd.DataFrame({'Features': X.columns, 'Importance': rf_tuned.feature_importances_})
coef_rf_tuned_df['Importance'] = coef_rf_tuned_df['Importance'].round(3)

In [131]:
coef_rf_tuned_df.sort_values(by = 'Importance', ascending = False)

Unnamed: 0,Features,Importance
10,alcohol,0.206
1,volatile acidity,0.116
5,free sulfur dioxide,0.113
7,density,0.089
6,total sulfur dioxide,0.075
3,residual sugar,0.073
8,pH,0.073
4,chlorides,0.072
2,citric acid,0.064
0,fixed acidity,0.061


#### Thus, **'alcohol'** is the most important factor according to the Random Forest Regressor with Tuned Parameters

---