In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read both competition tables (train first, then test); the first CSV
# column of each file becomes the row index.
x_train, x_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/train.csv', 'data/test.csv')
)
# Detach the target column in place, so x_train keeps only the features.
y_train = x_train.pop('quality')

In [3]:
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas' chained-assignment warning. Kept so later cells behave
# as before; consider narrowing it to specific warning categories.
warnings.filterwarnings(action='ignore')

# Categorical: encode the wine type as an integer code (red -> 0, white -> 1).
#
# The column is replaced with one whole-column assignment instead of chained
# indexing (`df['type'][mask] = value`). Chained indexing writes into an
# intermediate object, so under pandas Copy-on-Write the frame itself is
# left unchanged.
#
# Labels that are not in the mapping (including codes produced by an earlier
# run of this cell) are passed through untouched, so re-running the cell is
# a no-op, just like the original masked assignment.
type_codes = {'red': 0, 'white': 1}
for frame in (x_train, x_test):
    frame['type'] = frame['type'].map(lambda label: type_codes.get(label, label))

In [4]:
# Normalization: z-score every feature except 'type'.
# The centre and spread are computed on the training split only and the very
# same shift/scale is then applied to the test split.
for column, train_values in x_train.items():
    if column != 'type':
        centre = train_values.mean()
        spread = train_values.std()
        x_train[column] = x_train[column].sub(centre).div(spread)
        x_test[column] = x_test[column].sub(centre).div(spread)

In [5]:
# Feature Engineering with PCA

def replace_with_pca(train, test, features, new_name):
    """Collapse a group of columns into their first principal component.

    A one-component PCA is fitted on ``train[features]`` only and then used
    to project both splits. The source columns are dropped and the projected
    values are appended as a new column called ``new_name``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature tables; both must contain every column in ``features``.
        Neither input frame is modified.
    features : list of str
        Names of the columns to merge.
    new_name : str
        Name of the column that receives the projected values.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        New ``(train, test)`` frames without ``features`` and with
        ``new_name`` appended as the last column.
    """
    pca = PCA(n_components=1)
    pca.fit(train[features])  # fit on the training split only

    # transform() returns an (n_rows, 1) array; take the single column so a
    # plain 1-D vector is stored in the frame.
    train_component = pca.transform(train[features])[:, 0]
    test_component = pca.transform(test[features])[:, 0]

    # drop() returns new frames, so the caller's inputs stay untouched.
    train = train.drop(features, axis=1)
    test = test.drop(features, axis=1)
    train[new_name] = train_component
    test[new_name] = test_component
    return train, test


# PCA1
# NOTE(review): presumably these two features are merged because they are
# strongly correlated -- confirm against the exploratory analysis.
x_train, x_test = replace_with_pca(
    x_train, x_test, ['alcohol', 'density'], 'pca1'
)

# PCA2
x_train, x_test = replace_with_pca(
    x_train, x_test, ['total sulfur dioxide', 'free sulfur dioxide'], 'pca2'
)

x_train.shape, x_test.shape

((5497, 10), (1000, 10))

In [6]:
# train
# random_state is fixed so repeated runs build the same forest.
model = RandomForestClassifier(random_state=123)
model.fit(x_train, y_train)

# prediction
prediction = model.predict(x_test)

# Build the submission table with one sequential id per predicted row.
# The id range is derived from the number of predictions instead of a
# hard-coded 1000, so the cell no longer fails on a test set of another size.
# NOTE(review): if the submission ids must match the ids stored in
# data/test.csv, use x_test.index here instead -- confirm the expected format.
df = pd.DataFrame({
    'index': range(len(prediction)),
    'quality': prediction
})
df.to_csv('submission.csv', index=False)