In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Normalizer
from sklearn.metrics import cohen_kappa_score
from sklearn.svm import SVC

In [2]:
#import data
sample_submission_data = pd.read_csv('sample_submission.csv')
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
wine_extra_data = pd.read_csv('WineQT.csv')

In [3]:
print(f'Size of train: {train_data.shape}')
print(f'Size of test: {test_data.shape}')
print(f'Size of sample: {sample_submission_data.shape}')
print(f'Size of extradata: {wine_extra_data.shape}')

Size of train: (2056, 13)
Size of test: (1372, 12)
Size of sample: (1372, 2)
Size of extradata: (1143, 13)


In [4]:
#add extra data to train
train_data = train_data.append(wine_extra_data, ignore_index = True)

In [5]:
X = train_data.drop(['Id', 'quality'], axis=1)
y = train_data.quality

In [57]:
transformer = Normalizer()
X_norm_list = transformer.transform(X)
X = pd.DataFrame(X_norm_list, columns=list(X))

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)

In [7]:
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,8.0,0.5,0.39,2.2,0.073,30.0,39.0,0.99572,3.33,0.77,12.1
1,9.3,0.3,0.73,2.3,0.092,30.0,67.0,0.99854,3.32,0.67,12.8
2,7.1,0.51,0.03,2.1,0.059,3.0,12.0,0.9966,3.52,0.73,11.3
3,8.1,0.87,0.22,2.6,0.084,11.0,65.0,0.9973,3.2,0.53,9.8
4,8.5,0.36,0.3,2.3,0.079,10.0,45.0,0.99444,3.2,1.36,9.5


In [8]:
#modeling

In [130]:
rf = RandomForestClassifier(criterion='entropy')
parametrs = {'n_estimators':range(60, 80, 4),
            'max_depth':range(1, 30, 6),
            'min_samples_split':range(5, 56, 10),
            'min_samples_leaf':range(2, 11, 2)}
search = GridSearchCV(rf, parametrs, cv=3, n_jobs=-1)
search.fit(X_train, y_train)
model = search.best_estimator_
prediction = model.predict(X_test)
score = cohen_kappa_score(y_test, prediction)

In [131]:
score

0.3558247903075489

In [132]:
model

RandomForestClassifier(criterion='entropy', max_depth=25, min_samples_leaf=2,
                       min_samples_split=5, n_estimators=64)

In [142]:
results = pd.DataFrame(columns=['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'score'])
for n_estimators in range(1, 50, 1):
    for max_depth in range(90, 91):
        for min_samples_split in range(5, 6):
            for min_samples_leaf in range(5, 6):
                rf = RandomForestClassifier(criterion='entropy',
                                            n_estimators=n_estimators,
                                            max_depth=max_depth,
                                            min_samples_split=min_samples_split,
                                            min_samples_leaf=min_samples_leaf)
                rf.fit(X_train, y_train)
                prediction = rf.predict(X_test)
                score = cohen_kappa_score(y_test, prediction)
                
                pred_df = pd.DataFrame({'n_estimators':[n_estimators],
                              'max_depth':[max_depth],
                              'min_samples_split':[min_samples_split],
                              'min_samples_leaf':[min_samples_leaf],
                              'score':score})
                results = results.append(pred_df, ignore_index = True)
results.sort_values('score',ascending=False).head()

Unnamed: 0,n_estimators,max_depth,min_samples_split,min_samples_leaf,score
30,31,90,5,5,0.360668
31,32,90,5,5,0.357494
45,46,90,5,5,0.346709
13,14,90,5,5,0.346589
23,24,90,5,5,0.345887


In [143]:
best_moder_so_far = RandomForestClassifier(n_estimators=31, max_depth=90, min_samples_split=5, min_samples_leaf=5)

In [9]:
svc = SVC()
svc.fit(X_train, y_train)
prediction = svc.predict(X_test)
score = cohen_kappa_score(y_test, prediction)