In [1]:
import pandas as pd

# Location of the white-wine quality dataset (semicolon-delimited CSV).
file_path = "winequality-white.csv"

# Parse the file into a DataFrame, telling pandas about the ';' separator.
df = pd.read_csv(file_path, sep=";")

# Preview the top five rows as plain text.
print(df.head(5))

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.0              0.27         0.36            20.7      0.045   
1            6.3              0.30         0.34             1.6      0.049   
2            8.1              0.28         0.40             6.9      0.050   
3            7.2              0.23         0.32             8.5      0.058   
4            7.2              0.23         0.32             8.5      0.058   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 45.0                 170.0   1.0010  3.00       0.45   
1                 14.0                 132.0   0.9940  3.30       0.49   
2                 30.0                  97.0   0.9951  3.26       0.44   
3                 47.0                 186.0   0.9956  3.19       0.40   
4                 47.0                 186.0   0.9956  3.19       0.40   

   alcohol  quality  
0      8.8        6  
1      9.5        6  
2     10.1        6 

### Data Preprocessing

In [7]:
# Per-column count of missing entries (isnull is pandas' alias for isna).
df.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

### Define the Training Features and Target Variable

In [10]:
# Target variable (y): the 'quality' column.
y = df['quality']

# Input features (X): every column except 'quality'.
X = df.drop(columns='quality')


### Split the Dataset into Training and Test Sets

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

### Scale the Features

In [12]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaler = scaler.transform(X_train)
X_test_scaler = scaler.transform(X_test)

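Note: `MinMaxScaler` is an alternative I sometimes use; it rescales each feature to a fixed range (by default [0, 1]) instead of standardizing to zero mean and unit variance.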

### Model Training First Version

In [13]:
from sklearn.svm import SVR

# First model: SVR with every hyperparameter left at its default, fitted on
# the scaled training data.
svr = (
    SVR()
    .fit(X_train_scaler, y_train)
)

### Model Evaluation (First Version)

In [14]:
from sklearn.metrics import mean_absolute_error

# Predict on the scaled test split; the MAE on the last line is the cell's
# displayed result.
y_pred = svr.predict(X_test_scaler)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

0.495342396522464

### Model Training Second Version

In [15]:
from sklearn.svm import SVR

# Second model: identical training data, but with a linear kernel.
svr1 = SVR(kernel='linear')
svr1 = svr1.fit(X_train_scaler, y_train)  # fit() hands back the same estimator

### Model Evaluation (Second Version)

In [16]:
# Test-set MAE of the linear-kernel model (shown as the cell's result).
y_pred1 = svr1.predict(X_test_scaler)
mean_absolute_error(y_true=y_test, y_pred=y_pred1)

0.5550017448735088

In [17]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

# Hyperparameter grid to search.
# `gamma` is a kernel coefficient used by the 'rbf' and 'poly' kernels only, so
# the linear kernel gets its own sub-grid without it. Sweeping gamma for the
# linear kernel would fit every linear candidate twice with identical results.
param_grid = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1.0, 10.0],
        'epsilon': [0.1, 0.2, 0.3],
    },
    {
        'kernel': ['rbf', 'poly'],
        'C': [0.1, 1.0, 10.0],
        'gamma': ['scale', 'auto'],
        'epsilon': [0.1, 0.2, 0.3],
    },
]

# 5-fold cross-validated search, scored by (negated) mean squared error.
# A fresh SVR() is passed directly instead of rebinding `svr`, so the fitted
# baseline model from the first training cell stays usable after this cell runs.
# NOTE: X_train_scaler was standardized with statistics from the whole training
# split, so each validation fold influenced its own scaling; putting the scaler
# and the SVR in a Pipeline would remove that small optimistic bias.
grid_search = GridSearchCV(
    SVR(),
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train_scaler, y_train)

# Print the best parameters and best score (sign flipped back to a plain MSE).
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", -grid_search.best_score_)

# Predict on the test set using the best estimator (refit on all training data).
y_pred = grid_search.best_estimator_.predict(X_test_scaler)

# Evaluate the tuned model. MSE matches the search criterion; MAE is reported
# too so the result can be compared with the MAE of the two earlier models.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Squared Error on Test Set:", mse)
print("Mean Absolute Error on Test Set:", mae)


Best Parameters: {'C': 1.0, 'epsilon': 0.2, 'gamma': 'scale', 'kernel': 'rbf'}
Best Score: 0.48921363301655446
Mean Squared Error on Test Set: 0.42264380645679683
