In [1]:
# Load libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split # Import train_test_split function
import numpy as np
import gensim
from scipy.sparse import hstack
import utils


# Saved Word2Vec model; later handed to utils.text_to_embeddings to embed the
# wine descriptions.
W2V_MODEL_PATH = "data/updated_google_corpus"
w2v_model = gensim.models.word2vec.Word2Vec.load(W2V_MODEL_PATH)

In [2]:
# Global switch: forwarded to utils.load_data as `is_train`, and checked by
# later cells to decide whether a hold-out slice is carved off.
TESTING_MODE = False
data = utils.load_data("./data/train.csv", is_train=TESTING_MODE)

# Report the size of the loaded frame (rows, columns).
n_examples, n_columns = data.shape
print("Loaded {0} examples with {1} columns".format(n_examples, n_columns))

Loaded 258210 examples with 14 columns


In [3]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
data.describe()

Unnamed: 0,points,price,id
count,258210.0,175000.0,258210.0
mean,88.084734,34.3044,61312.957329
std,3.157554,38.398146,39102.177584
min,79.636128,4.0,0.0
25%,85.96939,16.0,28728.0
50%,87.981722,25.0,57518.5
75%,90.088168,40.0,88859.75
max,100.220603,2500.0,150929.0


In [4]:
# Per-column dtype and non-null count, to see which columns have gaps.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 258210 entries, 0 to 83209
Data columns (total 14 columns):
country                  258146 non-null object
description              258210 non-null object
designation              181120 non-null object
points                   258210 non-null float64
price                    175000 non-null float64
province                 258146 non-null object
region_1                 215793 non-null object
region_2                 110996 non-null object
taster_name              96479 non-null object
taster_twitter_handle    91559 non-null object
title                    120975 non-null object
variety                  258209 non-null object
winery                   258210 non-null object
id                       258210 non-null int64
dtypes: float64(2), int64(1), object(11)
memory usage: 29.5+ MB


In [5]:
# Rebind `data` to the output of utils.remove_nan.
# NOTE(review): the helper is not visible here; judging by the preview that
# follows, it presumably fills empty text fields with the string "missing" --
# confirm in utils. Because `data` is overwritten, re-running this cell feeds
# the already-processed frame back into the helper.
data = utils.remove_nan(data)

In [6]:
# Preview the first rows of the frame after the NaN handling step.
data.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,id
0,Portugal,This is a fine rich balanced wine. It has ripe...,Vila Santa Reserva,88.870874,20,Alentejano,missing,missing,missing,missing,missing,PORTUGUESE RED,J. Portugal Ramos,32027
1,France,"A solid, chunky wine, with a structure that is...",missing,88.041695,28,Bordeaux,Lalande de Pomerol,missing,missing,missing,missing,BORDEAUX-STYLE RED BLEND,Château Tour Grand Colombier,71079
2,France,"This is powerful and concentrated, with the hi...",missing,94.085021,130,Bordeaux,Saint-Émilion,missing,missing,missing,missing,BORDEAUX-STYLE RED BLEND,Château Figeac,32440
3,US,"Rich, ripe and oaky, this Petite Sirah charms ...",Thompson Vineyard,89.869797,34,California,Santa Barbara County,Central Coast,missing,missing,Jaffurs 2010 Thompson Vineyard Petite Sirah (S...,PETITE SIRAH,Jaffurs,124405
4,US,This wine is a unique in the state blend and f...,McKinley Springs Vineyard,89.017651,24,Washington,Horse Heaven Hills,Columbia Valley,Sean P. Sullivan,@wawinereport,Syncline 2016 McKinley Springs Vineyard Rosé (...,ROSé,Syncline,33649


In [7]:
# Categorical columns handed to utils.one_hot_encoding; the result is stacked
# into the feature matrix further down.
categorical_columns = ["country", "region_2", "province", "taster_name", "variety"]
one_hot_encoding = utils.one_hot_encoding(data, categorical_columns)

In [8]:
# Text-like columns handed to utils.get_count_vectorizer.
# NOTE(review): `transforms` is not referenced by any other cell shown in this
# notebook -- the feature matrix is built from the description embeddings, the
# one-hot block and `points` only. Confirm whether these features were meant
# to be included.
count_vectorizer_columns = ["title", "region_1", "designation"]
transforms = utils.get_count_vectorizer(data, count_vectorizer_columns)

In [9]:
# Turn each wine description into a vector using the Word2Vec model loaded in
# the first cell (conversion details live in utils.text_to_embeddings).
descriptions = data["description"]
description_vecs = utils.text_to_embeddings(descriptions, w2v_model=w2v_model)

In [10]:
# Assemble the feature matrix column-wise: description embeddings, one-hot
# categorical block, and the raw `points` score. CSR format allows row slicing.
feature_blocks = [description_vecs, one_hot_encoding, data[["points"]]]
X = hstack(feature_blocks).tocsr()
y = data["price"]

if not TESTING_MODE:
    # The first 175000 rows are used for fitting; everything after that
    # position is kept aside as the hold-out slice.
    n_train = 175000
    # Right-hand sides are evaluated before rebinding, so both slices are
    # taken from the full matrix / series.
    X, X_test = X[:n_train], X[n_train:]
    y, y_test = y[:n_train], y[n_train:]

In [11]:
# Hyper-parameter search for the random forest.
# The search is ON by default (same as before); set RUN_GRID_SEARCH = False to
# skip it. The grid below spans 9 * 10 * 5 * 5 * 5 = 11,250 combinations, so
# expect this cell to be slow.
RUN_GRID_SEARCH = True
if RUN_GRID_SEARCH:
    # Float values for min_samples_split / min_samples_leaf / max_features are
    # interpreted by scikit-learn as fractions (of samples or of features).
    params = {
        "n_estimators" : [5, 10, 15, 20, 25, 30, 50, 75, 100],
        "max_depth" : [2, 5, 10, 15, 20, 25, 30, 50, 75, 100],
        "min_samples_split" : np.linspace(0.1, 1.0, 5, endpoint=True),
        "min_samples_leaf" : np.linspace(0.1, .5, 5, endpoint=True),
        "max_features" : np.linspace(0.1, 1.0, 5, endpoint=True),
    }

    # random_state pins the forest's own randomness so repeated runs of the
    # search are comparable; n_jobs=8 fits trees on 8 workers.
    rfr = RandomForestRegressor(n_jobs=8, random_state=42)
    # NOTE(review): utils.train_grid_search is not shown here; it is expected
    # to return a fitted search object exposing `best_estimator_`.
    grid_search = utils.train_grid_search(rfr, X, y, params)
    print(grid_search.best_estimator_)

In [None]:
%%time
rfr = RandomForestRegressor()
rfr, scores = utils.train_kfold(rfr, X, y)

In [None]:
if not TESTING_MODE:
    # Ids of the hold-out rows: same positional boundary (175000) that was
    # used to carve X_test out of the feature matrix.
    ids = data["id"].iloc[175000:]

    # Predict with the random forest fitted in the training cell. The previous
    # code called `lr.predict`, but no `lr` is defined in this notebook.
    y_pred = rfr.predict(X_test)

    # Removing outliers: predictions below 0 or above 2500 are replaced by the
    # mean of the price column.
    # NOTE(review): the mean is taken over all rows of `data`, including the
    # hold-out rows -- confirm that is intended.
    fallback_price = data["price"].mean()
    y_pred[(y_pred > 2500) | (y_pred < 0)] = fallback_price

    # Build the submission frame from plain arrays so rows pair up by position.
    result = pd.DataFrame({"id": ids.values, "price": y_pred})
    result.to_csv("data/out_lr.csv", index=False)