# Importing the Libraries

In [198]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Importing the Dataset

In [199]:
dataset = pd.read_csv('winequality-red.csv')

In [200]:
print(dataset)

      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0               7.4             0.700         0.00             1.9      0.076   
1               7.8             0.880         0.00             2.6      0.098   
2               7.8             0.760         0.04             2.3      0.092   
3              11.2             0.280         0.56             1.9      0.075   
4               7.4             0.700         0.00             1.9      0.076   
...             ...               ...          ...             ...        ...   
1594            6.2             0.600         0.08             2.0      0.090   
1595            5.9             0.550         0.10             2.2      0.062   
1596            6.3             0.510         0.13             2.3      0.076   
1597            5.9             0.645         0.12             2.0      0.075   
1598            6.0             0.310         0.47             3.6      0.067   

      free sulfur dioxide  

In [201]:
mv = dataset.isnull()

In [202]:
print(mv)

      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0             False             False        False           False      False   
1             False             False        False           False      False   
2             False             False        False           False      False   
3             False             False        False           False      False   
4             False             False        False           False      False   
...             ...               ...          ...             ...        ...   
1594          False             False        False           False      False   
1595          False             False        False           False      False   
1596          False             False        False           False      False   
1597          False             False        False           False      False   
1598          False             False        False           False      False   

      free sulfur dioxide  

In [203]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [204]:
dataset = dataset.drop_duplicates()

In [205]:
duplicates = dataset.duplicated().sum()
print("Number of duplicate rows:", duplicates)


Number of duplicate rows: 0


In [206]:
print(X)

[[ 7.4   0.7   0.   ...  3.51  0.56  9.4 ]
 [ 7.8   0.88  0.   ...  3.2   0.68  9.8 ]
 [ 7.8   0.76  0.04 ...  3.26  0.65  9.8 ]
 ...
 [ 6.3   0.51  0.13 ...  3.42  0.75 11.  ]
 [ 5.9   0.65  0.12 ...  3.57  0.71 10.2 ]
 [ 6.    0.31  0.47 ...  3.39  0.66 11.  ]]


In [207]:
print(y)

[5 5 5 ... 6 5 6]


# Splitting Data into Train and Test Set and Feature Scaling

In [208]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)


sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


# Training the Multiple Linear Regression Model

In [209]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [210]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[5.55 5.  ]
 [5.36 6.  ]
 [6.11 6.  ]
 ...
 [6.48 6.  ]
 [5.83 6.  ]
 [5.46 5.  ]]


In [211]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.3404768167622618

# Training the Polynomial Regression Model

In [212]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
poly_reg = PolynomialFeatures(degree = 4)
X_poly = poly_reg.fit_transform(X_train)
regressor = LinearRegression()
regressor.fit(X_poly, y_train)

In [213]:
y_pred = regressor.predict(poly_reg.transform(X_test))
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[ -7.45   5.  ]
 [  5.08   6.  ]
 [  8.4    6.  ]
 ...
 [ 10.43   6.  ]
 [805.11   6.  ]
 [ 12.76   5.  ]]


In [214]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

-12589.655748051418

# Training the Random Forest Regression Model

In [215]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
regressor.fit(X_train, y_train)

In [216]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[5.78 5.  ]
 [5.51 6.  ]
 [6.18 6.  ]
 ...
 [6.18 6.  ]
 [5.61 6.  ]
 [5.2  5.  ]]


In [217]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.4358079753246199

# Training the Decision Tree Regression Model

In [218]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(max_depth=4, min_samples_split=10, random_state=0)
regressor.fit(X_train, y_train)

In [219]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[5.96 5.  ]
 [5.96 6.  ]
 [5.96 6.  ]
 ...
 [6.41 6.  ]
 [5.21 6.  ]
 [5.34 5.  ]]


In [220]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.26527479424573874