In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text
import opendatasets as od

# Predict wine prices by dataset (https://www.kaggle.com/zynicide/wine-reviews)

## Using RandomForestClassifier, DecisionTreeClassifier, LogisticRegression

In [2]:
od.download("https://www.kaggle.com/zynicide/wine-reviews/download")

Skipping, found downloaded files in "./wine-reviews" (use force=True to force download)


In [3]:
columns = [
    'country', 'points', 'province',
     "price"]


df = pd.read_csv("./wine-reviews/winemag-data-130k-v2.csv",usecols=columns)

In [4]:
df.head()

Unnamed: 0,country,points,price,province
0,Italy,87,,Sicily & Sardinia
1,Portugal,87,15.0,Douro
2,US,87,14.0,Oregon
3,US,87,13.0,Michigan
4,US,87,65.0,Oregon


In [5]:
df = df.fillna(0)

In [6]:
df.head()

Unnamed: 0,country,points,price,province
0,Italy,87,0.0,Sicily & Sardinia
1,Portugal,87,15.0,Douro
2,US,87,14.0,Oregon
3,US,87,13.0,Michigan
4,US,87,65.0,Oregon


In [9]:
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

In [10]:
categorical_columns

['country', 'province']

In [11]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)


In [12]:
def get_rating(row):
    if row.points >= 95:
        return 3
    elif row.points >= 85:
        return 2
    else:
        return 1 

In [13]:
df_full_train['rating'] = df_full_train.apply(get_rating, axis='columns')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_full_train['rating'] = df_full_train.apply(get_rating, axis='columns')


In [14]:
df_full_train.head()

Unnamed: 0,country,points,price,province,rating
84161,France,89,30.0,Alsace,2
99116,Portugal,92,108.0,Port,2
61795,France,85,63.0,Rhône Valley,2
62668,Greece,90,25.0,Markopoulo,2
4666,France,89,0.0,Bordeaux,2


In [15]:
corr_df = df_full_train.corr()

In [16]:
corr_df.unstack().sort_values(ascending=False)

points  points    1.000000
price   price     1.000000
rating  rating    1.000000
points  rating    0.638299
rating  points    0.638299
points  price     0.389547
price   points    0.389547
        rating    0.241719
rating  price     0.241719
dtype: float64

In [17]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [18]:
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

del df_train['price']
del df_val['price']
del df_test['price']

In [19]:
df_train.head()


Unnamed: 0,country,points,province,rating
0,France,84,Burgundy,1
1,US,86,Oregon,2
2,Spain,89,Northern Spain,2
3,US,94,California,2
4,Portugal,89,Port,2


In [20]:
dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
rf = RandomForestClassifier(n_estimators=10, random_state=1,n_jobs=-1)
rf.fit(X_train, y_train)

y_pred = rf.predict_proba(X_val)[:, 1]

In [23]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [24]:
mse = rmse(y_val, y_pred)

In [25]:
mse

52.60437865473959

In [26]:
max_leaf_nodes = [10,100,1000]
for val in max_leaf_nodes:
    dt = DecisionTreeClassifier( max_leaf_nodes=val)
    dt.fit(X_train, y_train)
    y_pred = dt.predict_proba(X_val)[:, 1]
    mse = rmse(y_val, y_pred)
    print(val,mse)

10 52.604379360083165
100 52.604375383040264
1000 52.60438035856001


In [42]:
model = LogisticRegression(solver="liblinear", C=10.0, random_state=42)
common_mean = df_full_train.price.mean()

In [43]:
y_train_binary = (y_train <= common_mean).astype(int)
y_val_binary = (y_val <= common_mean).astype(int)

In [44]:
model.fit(X_train, y_train_binary)

LogisticRegression(C=10.0, random_state=42, solver='liblinear')

In [45]:
y_pred_reg = model.predict_proba(X_val)[:, 1]

In [46]:
price_decision = (y_pred >= 0.5)

In [47]:
common_acc = round((y_val_binary == price_decision).mean(),2)

In [48]:
common_acc

0.35