In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error, r2_score, cohen_kappa_score
from sklearn.svm import SVR
import joblib

In [43]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [44]:
train_df = train_df.drop(['Id'], axis=1)
test_df = test_df.drop(['Id'], axis=1)

In [45]:
X = train_df.drop('quality', axis=1)
y = train_df['quality']

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standard Scaler Models

## Classification Models

### Linear SVM Classifier

In [47]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scaler = StandardScaler()
svm = SVC(kernel='linear', probability=True, random_state=42)
pipeline = Pipeline([('scaler', scaler), ('svm', svm)])
params = {'svm__C': np.linspace(0.001, 5, 10), 'svm__decision_function_shape': ['ovo', 'ovr']}
grid_search = GridSearchCV(pipeline, param_grid=params, verbose=3, cv=kfold, scoring='neg_log_loss')
grid_search.fit(X_train, y_train)

# Save the model to a pickle file
joblib.dump(grid_search.best_estimator_, 'std_cls_svm_linear.pkl')

print("Best parameters: ", grid_search.best_params_)
print("Best score: ", -grid_search.best_score_)
y_pred = grid_search.predict(X_test)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END svm__C=0.001, svm__decision_function_shape=ovo;, score=-1.094 total time=   0.7s
[CV 2/5] END svm__C=0.001, svm__decision_function_shape=ovo;, score=-1.090 total time=   0.6s
[CV 3/5] END svm__C=0.001, svm__decision_function_shape=ovo;, score=-1.064 total time=   0.6s
[CV 4/5] END svm__C=0.001, svm__decision_function_shape=ovo;, score=-1.047 total time=   0.6s
[CV 5/5] END svm__C=0.001, svm__decision_function_shape=ovo;, score=-1.042 total time=   0.7s
[CV 1/5] END svm__C=0.001, svm__decision_function_shape=ovr;, score=-1.094 total time=   0.7s
[CV 2/5] END svm__C=0.001, svm__decision_function_shape=ovr;, score=-1.090 total time=   0.6s
[CV 3/5] END svm__C=0.001, svm__decision_function_shape=ovr;, score=-1.064 total time=   0.6s
[CV 4/5] END svm__C=0.001, svm__decision_function_shape=ovr;, score=-1.047 total time=   0.6s
[CV 5/5] END svm__C=0.001, svm__decision_function_shape=ovr;, score=-1.042 total time=   0.6

### Radial SVM Classifier

In [48]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scaler = StandardScaler()
svm = SVC(kernel='rbf', probability=True, random_state=42)
pipeline = Pipeline([('scaler', scaler), ('svm', svm)])
params = {'svm__C': np.linspace(0.001, 5, 10), 'svm__gamma': np.linspace(0.001, 5, 10)}
grid_search = GridSearchCV(pipeline, param_grid=params, verbose=3, cv=kfold, scoring='neg_log_loss')
grid_search.fit(X_train, y_train)

# Save the model to a pickle file
joblib.dump(grid_search.best_estimator_, 'std_cls_svm_rbf.pkl')

print("Best parameters: ", grid_search.best_params_)
print("Best score: ", -grid_search.best_score_)
y_pred = grid_search.predict(X_test)
print("Accuracy score: ", accuracy_score(y_test, y_pred))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5] END ...svm__C=0.001, svm__gamma=0.001;, score=-1.257 total time=   1.0s
[CV 2/5] END ...svm__C=0.001, svm__gamma=0.001;, score=-1.176 total time=   0.9s
[CV 3/5] END ...svm__C=0.001, svm__gamma=0.001;, score=-1.167 total time=   0.9s
[CV 4/5] END ...svm__C=0.001, svm__gamma=0.001;, score=-1.134 total time=   0.9s
[CV 5/5] END ...svm__C=0.001, svm__gamma=0.001;, score=-1.110 total time=   0.9s
[CV 1/5] END svm__C=0.001, svm__gamma=0.5564444444444444;, score=-1.118 total time=   0.9s
[CV 2/5] END svm__C=0.001, svm__gamma=0.5564444444444444;, score=-1.122 total time=   1.0s
[CV 3/5] END svm__C=0.001, svm__gamma=0.5564444444444444;, score=-1.085 total time=   1.0s
[CV 4/5] END svm__C=0.001, svm__gamma=0.5564444444444444;, score=-1.101 total time=   0.9s
[CV 5/5] END svm__C=0.001, svm__gamma=0.5564444444444444;, score=-1.083 total time=   1.0s
[CV 1/5] END svm__C=0.001, svm__gamma=1.1118888888888887;, score=-1.148 total

## Regression Models

### Linear SVM Regressor

In [53]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scaler = StandardScaler()
svr = SVR(kernel='linear')
pipeline = Pipeline([('scaler', scaler), ('svr', svr)])
params = {'svr__C': np.linspace(0.001, 5, 10), 'svr__epsilon': np.linspace(0.001, 5, 10)}
grid_search = GridSearchCV(pipeline, param_grid=params, verbose=2, cv=kfold, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Save the model to a pickle file
joblib.dump(grid_search.best_estimator_, 'std_reg_svr_linear.pkl')

# Print the best parameters and the best score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", -grid_search.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END ...................svr__C=0.001, svr__epsilon=0.001; total time=   0.2s
[CV] END ...................svr__C=0.001, svr__epsilon=0.001; total time=   0.2s
[CV] END ...................svr__C=0.001, svr__epsilon=0.001; total time=   0.2s
[CV] END ...................svr__C=0.001, svr__epsilon=0.001; total time=   0.2s
[CV] END ...................svr__C=0.001, svr__epsilon=0.001; total time=   0.2s
[CV] END ......svr__C=0.001, svr__epsilon=0.5564444444444444; total time=   0.1s
[CV] END ......svr__C=0.001, svr__epsilon=0.5564444444444444; total time=   0.1s
[CV] END ......svr__C=0.001, svr__epsilon=0.5564444444444444; total time=   0.1s
[CV] END ......svr__C=0.001, svr__epsilon=0.5564444444444444; total time=   0.1s
[CV] END ......svr__C=0.001, svr__epsilon=0.5564444444444444; total time=   0.1s
[CV] END ......svr__C=0.001, svr__epsilon=1.1118888888888887; total time=   0.0s
[CV] END ......svr__C=0.001, svr__epsilon=1.11

### Radial SVM Regressor

In [50]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scaler = StandardScaler()
svr = SVR(kernel='rbf')
pipeline = Pipeline([('scaler', scaler), ('svr', svr)])
params = {'svr__C': np.linspace(0.001, 5, 10),'svr__gamma': np.linspace(0.001, 5, 10), 'svr__epsilon': np.linspace(0.001, 5, 10)}
grid_search = GridSearchCV(pipeline, param_grid=params, verbose=2, cv=kfold, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Save the model to a pickle file
joblib.dump(grid_search.best_estimator_, 'std_reg_svr_rbf.pkl')

# Print the best parameters and the best score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", -grid_search.best_score_)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
[CV] END .svr__C=0.001, svr__epsilon=0.001, svr__gamma=0.001; total time=   0.2s
[CV] END .svr__C=0.001, svr__epsilon=0.001, svr__gamma=0.001; total time=   0.2s
[CV] END .svr__C=0.001, svr__epsilon=0.001, svr__gamma=0.001; total time=   0.2s
[CV] END .svr__C=0.001, svr__epsilon=0.001, svr__gamma=0.001; total time=   0.2s
[CV] END .svr__C=0.001, svr__epsilon=0.001, svr__gamma=0.001; total time=   0.2s
[CV] END svr__C=0.001, svr__epsilon=0.001, svr__gamma=0.5564444444444444; total time=   0.2s
[CV] END svr__C=0.001, svr__epsilon=0.001, svr__gamma=0.5564444444444444; total time=   0.2s
[CV] END svr__C=0.001, svr__epsilon=0.001, svr__gamma=0.5564444444444444; total time=   0.2s
[CV] END svr__C=0.001, svr__epsilon=0.001, svr__gamma=0.5564444444444444; total time=   0.2s
[CV] END svr__C=0.001, svr__epsilon=0.001, svr__gamma=0.5564444444444444; total time=   0.2s
[CV] END svr__C=0.001, svr__epsilon=0.001, svr__gamma=1.111888888

# MinMax Scaler Models

## Classification Models

### Linear SVM Classifier

In [54]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scaler = MinMaxScaler()
svm = SVC(kernel='linear', probability=True, random_state=42)
pipeline = Pipeline([('scaler', scaler), ('svm', svm)])
params = {'svm__C': np.linspace(0.001, 5, 10), 'svm__decision_function_shape': ['ovo', 'ovr']}
grid_search = GridSearchCV(pipeline, param_grid=params, verbose=3, cv=kfold, scoring='neg_log_loss')
grid_search.fit(X_train, y_train)

# Save the model to a pickle file
joblib.dump(grid_search.best_estimator_, 'mm_cls_svm_linear.pkl')

print("Best parameters: ", grid_search.best_params_)
print("Best score: ", -grid_search.best_score_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END svm__C=0.001, svm__decision_function_shape=ovo;, score=-1.133 total time=   0.7s
[CV 2/5] END svm__C=0.001, svm__decision_function_shape=ovo;, score=-1.114 total time=   0.6s
[CV 3/5] END svm__C=0.001, svm__decision_function_shape=ovo;, score=-1.053 total time=   0.6s
[CV 4/5] END svm__C=0.001, svm__decision_function_shape=ovo;, score=-1.060 total time=   0.6s
[CV 5/5] END svm__C=0.001, svm__decision_function_shape=ovo;, score=-1.069 total time=   0.6s
[CV 1/5] END svm__C=0.001, svm__decision_function_shape=ovr;, score=-1.133 total time=   0.6s
[CV 2/5] END svm__C=0.001, svm__decision_function_shape=ovr;, score=-1.114 total time=   0.6s
[CV 3/5] END svm__C=0.001, svm__decision_function_shape=ovr;, score=-1.053 total time=   0.6s
[CV 4/5] END svm__C=0.001, svm__decision_function_shape=ovr;, score=-1.060 total time=   0.6s
[CV 5/5] END svm__C=0.001, svm__decision_function_shape=ovr;, score=-1.069 total time=   0.6

### Radial SVM Classifier

In [55]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scaler = MinMaxScaler()
svm = SVC(kernel='rbf', probability=True, random_state=42)
pipeline = Pipeline([('scaler', scaler), ('svm', svm)])
params = {'svm__C': np.linspace(0.001, 5, 10), 'svm__gamma': np.linspace(0.001, 5, 10)}
grid_search = GridSearchCV(pipeline, param_grid=params, verbose=3, cv=kfold, scoring='neg_log_loss')
grid_search.fit(X_train, y_train)

# Save the model to a pickle file
joblib.dump(grid_search.best_estimator_, 'mm_cls_svm_rbf.pkl')

print("Best parameters: ", grid_search.best_params_)
print("Best score: ", -grid_search.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5] END ...svm__C=0.001, svm__gamma=0.001;, score=-1.141 total time=   0.9s
[CV 2/5] END ...svm__C=0.001, svm__gamma=0.001;, score=-1.154 total time=   0.9s
[CV 3/5] END ...svm__C=0.001, svm__gamma=0.001;, score=-1.069 total time=   0.8s
[CV 4/5] END ...svm__C=0.001, svm__gamma=0.001;, score=-1.168 total time=   0.9s
[CV 5/5] END ...svm__C=0.001, svm__gamma=0.001;, score=-1.118 total time=   0.9s
[CV 1/5] END svm__C=0.001, svm__gamma=0.5564444444444444;, score=-1.104 total time=   0.9s
[CV 2/5] END svm__C=0.001, svm__gamma=0.5564444444444444;, score=-1.116 total time=   0.9s
[CV 3/5] END svm__C=0.001, svm__gamma=0.5564444444444444;, score=-1.053 total time=   0.9s
[CV 4/5] END svm__C=0.001, svm__gamma=0.5564444444444444;, score=-1.064 total time=   0.9s
[CV 5/5] END svm__C=0.001, svm__gamma=0.5564444444444444;, score=-1.059 total time=   0.9s
[CV 1/5] END svm__C=0.001, svm__gamma=1.1118888888888887;, score=-1.103 total

## Regression Models

### Linear SVM Regressor

In [56]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scaler = StandardScaler()
svr = SVR(kernel='linear')
pipeline = Pipeline([('scaler', scaler), ('svr', svr)])
params = {'svr__C': np.linspace(0.001, 5, 10), 'svr__epsilon': np.linspace(0.001, 5, 10)}
grid_search = GridSearchCV(pipeline, param_grid=params, verbose=2, cv=kfold, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Save the model to a pickle file
joblib.dump(grid_search.best_estimator_, 'mm_reg_svr_linear.pkl')

# Print the best parameters and the best score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", -grid_search.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END ...................svr__C=0.001, svr__epsilon=0.001; total time=   0.2s
[CV] END ...................svr__C=0.001, svr__epsilon=0.001; total time=   0.2s
[CV] END ...................svr__C=0.001, svr__epsilon=0.001; total time=   0.2s
[CV] END ...................svr__C=0.001, svr__epsilon=0.001; total time=   0.2s
[CV] END ...................svr__C=0.001, svr__epsilon=0.001; total time=   0.2s
[CV] END ......svr__C=0.001, svr__epsilon=0.5564444444444444; total time=   0.1s
[CV] END ......svr__C=0.001, svr__epsilon=0.5564444444444444; total time=   0.1s
[CV] END ......svr__C=0.001, svr__epsilon=0.5564444444444444; total time=   0.1s
[CV] END ......svr__C=0.001, svr__epsilon=0.5564444444444444; total time=   0.1s
[CV] END ......svr__C=0.001, svr__epsilon=0.5564444444444444; total time=   0.1s
[CV] END ......svr__C=0.001, svr__epsilon=1.1118888888888887; total time=   0.0s
[CV] END ......svr__C=0.001, svr__epsilon=1.11

### Radial SVM Regressor 

In [57]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scaler = StandardScaler()
svr = SVR(kernel='rbf')
pipeline = Pipeline([('scaler', scaler), ('svr', svr)])
params = {'svr__C': np.linspace(0.001, 5, 10),'svr__gamma': np.linspace(0.001, 5, 10), 'svr__epsilon': np.linspace(0.001, 5, 10)}
grid_search = GridSearchCV(pipeline, param_grid=params, verbose=2, cv=kfold, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Save the model to a pickle file
joblib.dump(grid_search.best_estimator_, 'mm_reg_svr_rbf.pkl')

# Print the best parameters and the best score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", -grid_search.best_score_)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
[CV] END .svr__C=0.001, svr__epsilon=0.001, svr__gamma=0.001; total time=   0.3s
[CV] END .svr__C=0.001, svr__epsilon=0.001, svr__gamma=0.001; total time=   0.2s
[CV] END .svr__C=0.001, svr__epsilon=0.001, svr__gamma=0.001; total time=   0.2s
[CV] END .svr__C=0.001, svr__epsilon=0.001, svr__gamma=0.001; total time=   0.2s
[CV] END .svr__C=0.001, svr__epsilon=0.001, svr__gamma=0.001; total time=   0.2s
[CV] END svr__C=0.001, svr__epsilon=0.001, svr__gamma=0.5564444444444444; total time=   0.2s
[CV] END svr__C=0.001, svr__epsilon=0.001, svr__gamma=0.5564444444444444; total time=   0.2s
[CV] END svr__C=0.001, svr__epsilon=0.001, svr__gamma=0.5564444444444444; total time=   0.2s
[CV] END svr__C=0.001, svr__epsilon=0.001, svr__gamma=0.5564444444444444; total time=   0.2s
[CV] END svr__C=0.001, svr__epsilon=0.001, svr__gamma=0.5564444444444444; total time=   0.2s
[CV] END svr__C=0.001, svr__epsilon=0.001, svr__gamma=1.111888888

In [61]:
#function to load the model
def load_model(model_name):
    return joblib.load(model_name)

In [62]:
std_cls_svm_linear = load_model('std_cls_svm_linear.pkl')
std_cls_svm_rbf = load_model('std_cls_svm_rbf.pkl')
std_reg_svr_linear = load_model('std_reg_svr_linear.pkl')
std_reg_svr_rbf = load_model('std_reg_svr_rbf.pkl')
mm_cls_svm_linear = load_model('mm_cls_svm_linear.pkl')
mm_cls_svm_rbf = load_model('mm_cls_svm_rbf.pkl')
mm_reg_svr_linear = load_model('mm_reg_svr_linear.pkl')
mm_reg_svr_rbf = load_model('mm_reg_svr_rbf.pkl')


# Evaluation

In [80]:
#As the competition uses quadratic weighted kappa as the evaluation metric, we will use it as well to evaluate our models.
predictions_list = []

y_pred = std_cls_svm_linear.predict(X_test)
predictions_list.append({"std_cls_svm_linear":y_pred})
print("Quadratic weighted kappa of svm linear classifier with standard scaler: ", cohen_kappa_score(y_test, y_pred, weights='quadratic'))

y_pred = std_cls_svm_rbf.predict(X_test)
predictions_list.append({"std_cls_svm_rbf":y_pred})
print("Quadratic weighted kappa of svm rbf classifier with standard scaler: ", cohen_kappa_score(y_test, y_pred, weights='quadratic'))

y_pred = std_reg_svr_linear.predict(X_test)
y_pred = np.round(y_pred).astype(int)
y_pred = np.clip(y_pred, 3, 9)
predictions_list.append({"std_reg_svr_linear":y_pred})
print("Quadratic weighted kappa of svr linear regressor with standard scaler: ", cohen_kappa_score(y_test, y_pred, weights='quadratic'))

y_pred = std_reg_svr_rbf.predict(X_test)
y_pred = np.round(y_pred).astype(int)
y_pred = np.clip(y_pred, 3, 9)
predictions_list.append({"std_reg_svr_rbf":y_pred})
print("Quadratic weighted kappa of svr rbf regressor with standard scaler: ", cohen_kappa_score(y_test, y_pred, weights='quadratic'))

y_pred = mm_cls_svm_linear.predict(X_test)
predictions_list.append({"mm_cls_svm_linear":y_pred})
print("Quadratic weighted kappa of svm linear classifier with min max scaler: ", cohen_kappa_score(y_test, y_pred, weights='quadratic'))

y_pred = mm_cls_svm_rbf.predict(X_test)
predictions_list.append({"mm_cls_svm_rbf":y_pred})
print("Quadratic weighted kappa of svm rbf classifier with min max scaler: ", cohen_kappa_score(y_test, y_pred, weights='quadratic'))

y_pred = mm_reg_svr_linear.predict(X_test)
y_pred = np.round(y_pred).astype(int)
y_pred = np.clip(y_pred, 3, 9)
predictions_list.append({"mm_reg_svr_linear":y_pred})
print("Quadratic weighted kappa of svr linear regressor with min max scaler: ", cohen_kappa_score(y_test, y_pred, weights='quadratic'))

y_pred = mm_reg_svr_rbf.predict(X_test)
y_pred = np.round(y_pred).astype(int)
y_pred = np.clip(y_pred, 3, 9)
predictions_list.append({"mm_reg_svr_rbf":y_pred})
print("Quadratic weighted kappa of svr rbf regressor with min max scaler: ", cohen_kappa_score(y_test, y_pred, weights='quadratic'))

Quadratic weighted kappa of svm linear classifier with standard scaler:  0.3244831285352058
Quadratic weighted kappa of svm rbf classifier with standard scaler:  0.30915846361706234
Quadratic weighted kappa of svr linear regressor with standard scaler:  0.4151115663831114
Quadratic weighted kappa of svr rbf regressor with standard scaler:  0.41205767306467755
Quadratic weighted kappa of svm linear classifier with min max scaler:  0.31921847762928535
Quadratic weighted kappa of svm rbf classifier with min max scaler:  0.3440272874725694
Quadratic weighted kappa of svr linear regressor with min max scaler:  0.4151115663831114
Quadratic weighted kappa of svr rbf regressor with min max scaler:  0.41205767306467755


# Predictions on test data

In [79]:
test_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.2,0.51,0.01,2.0,0.077,31.0,54.0,0.99748,3.39,0.59,9.8
1,7.2,0.755,0.15,2.0,0.102,14.0,35.0,0.99586,3.33,0.68,10.0
2,8.4,0.46,0.4,2.0,0.065,21.0,50.0,0.99774,3.08,0.65,9.5
3,8.0,0.47,0.4,1.8,0.056,14.0,25.0,0.9948,3.3,0.65,11.7
4,6.5,0.34,0.32,2.1,0.044,8.0,94.0,0.99356,3.23,0.48,12.8


In [81]:
#create a dictionary of models
model_dict = {
    "std_cls_svm_linear":std_cls_svm_linear,
    "std_cls_svm_rbf":std_cls_svm_rbf,
    "std_reg_svr_linear":std_reg_svr_linear,
    "std_reg_svr_rbf":std_reg_svr_rbf,
    "mm_cls_svm_linear":mm_cls_svm_linear,
    "mm_cls_svm_rbf":mm_cls_svm_rbf,
    "mm_reg_svr_linear":mm_reg_svr_linear,
    "mm_reg_svr_rbf":mm_reg_svr_rbf
}

In [82]:
#now using this ditctionary create 8 submission files on test data
for key, value in model_dict.items():
    y_pred = value.predict(test_df)
    y_pred = np.round(y_pred).astype(int)
    y_pred = np.clip(y_pred, 3, 9)
    sample_submission['quality'] = y_pred
    sample_submission.to_csv(key+'.csv', index=False)

| Model Name | Public Score on kaggle |
| --- | --- |
| mm_reg_svr_rbf | 0.5192 |
| mm_reg_svr_linear | 0.53266 |
| mm_cls_svm_rbf | 0.48773 |
| mm_cls_svm_linear | 0.42263 |
| std_reg_svr_rbf | 0.5192 |
| std_reg_svr_linear | 0.53266 |
| std_cls_svm_rbf | 0.45282 |
| std_cls_svm_linear | 0.42768 |

When treated as a regression problem better results are achieved