In [34]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
# Notebook-wide seed: pass it as `random_state=` to any step that shuffles or
# samples, so that a fresh "Restart & Run All" reproduces the same numbers.
random_state = 1

In [35]:
# Column labels in the order the fields appear in the raw file. They are passed
# via `names=`, so pandas treats the first line of the file as data.
cols = [
    'alcohol_by_weight',
    'rating',
    'bitterness',
    'nitrogen',
    'turbidity',
    'sugars',
    'degree_of_fermentation',
    'calorific_value',
    'density',
    'pH',
    'colour',
    'sulphites',
]

# Same labels with the target ('rating') moved to the last position, so that
# positional slicing can later separate features from target.
cols_new = [name for name in cols if name != 'rating'] + ['rating']

# Load the tab-separated file and reorder the columns in one step.
df = pd.read_table("beer_ratings.txt", names=cols).loc[:, cols_new]

In [36]:
# Rescale every column with RobustScaler and wrap the result back into a
# labelled DataFrame (the column labels are re-applied from `cols_new`).
# NOTE(review): the scaler is fitted on the whole frame, target column included,
# and before the train/test split below -- confirm this is intended.
scaler = RobustScaler()
df = pd.DataFrame(
    data=scaler.fit_transform(df),
    columns=cols_new,
)

In [37]:
# Features: every column except the last one.
X = df.iloc[:, :-1]

# Target: the last column ('rating', per the ordering in `cols_new`).
y = df.iloc[:, -1]

# Hold out 20% of the rows for testing. The split is seeded with the notebook's
# `random_state` so the train/test scores below are reproducible across runs;
# without a seed every execution draws a different split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

In [38]:
# Fit ordinary least squares on the training split. `fit` returns the estimator
# itself, so `model` and `lm` are two names for the same fitted object.
model = LinearRegression().fit(X_train, y_train)
lm = model

# R^2 on the training data (displayed as the cell's output).
lm.score(X_train, y_train)

0.4324571050764737

In [39]:
# R^2 on the held-out 20% split (displayed as the cell's output).
lm.score(X=X_test, y=y_test)

0.3619133824497095

In [40]:
# Out-of-fold prediction for every row of the full data set, using 100
# consecutive (unshuffled) folds. For a regressor an integer `cv` means exactly
# this KFold, so it is spelled out explicitly here.
predictions = cross_val_predict(model, X, y, cv=KFold(n_splits=100))

In [41]:
# Pooled R^2 between the true targets and the out-of-fold predictions.
# Despite the variable name and label, this is R^2, not a classification accuracy.
accuracy = metrics.r2_score(y_true=y, y_pred=predictions)
print('Cross-Predicted Accuracy:', accuracy)

Cross-Predicted Accuracy: 0.38316318860724885


In [42]:
# Repeated 10-fold cross-validation: 10 differently shuffled 10-fold partitions,
# giving 100 R^2 scores. Seeded from the notebook-level `random_state` instead
# of a hard-coded literal. (`RepeatedKFold` is imported in the top import cell.)
cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=random_state)
# evaluate model: one R^2 per fold, folds run in parallel on all cores
scores = cross_val_score(lm, X, y, scoring='r2', cv=cv, n_jobs=-1)
# report performance: mean and standard deviation of the per-fold R^2 values
print('Accuracy: %.3f +/-(%.3f)' % (scores.mean(), scores.std()))

Accuracy: 0.394 +/-(0.072)


In [43]:
# Single shuffled 10-fold cross-validation. Seeded from the notebook-level
# `random_state` instead of a hard-coded literal. Note that `cv` and `scores`
# are reassigned here, replacing the values from the repeated-CV cell.
cv = KFold(n_splits=10, shuffle=True, random_state=random_state)
# evaluate model: one R^2 per fold, folds run in parallel on all cores
scores = cross_val_score(lm, X, y, scoring='r2', cv=cv, n_jobs=-1)
# report performance: mean and standard deviation of the per-fold R^2 values
print('Accuracy: %.3f +/-(%.3f)' % (scores.mean(), scores.std()))

Accuracy: 0.396 +/-(0.057)
