# Single Model (Decision Tree) with Label Encoding

### Training

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_log_error,mean_squared_error
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
%matplotlib inline 

In [4]:
train = pd.read_csv('data download/train.tsv',sep = '\t')
test = pd.read_csv('data download/test.tsv',sep = '\t')

In [5]:
X_data = train.drop(['price','train_id','item_description'],axis = 1)
y_data = train['price']
X_data['category_name'] = X_data['category_name'].fillna(value='not_decided')
X_data['brand_name'] = X_data['brand_name'].fillna(value='unknown')

In [6]:
cat_label = LabelEncoder().fit_transform(X_data['category_name'])
brand_label = LabelEncoder().fit_transform(X_data['brand_name'])

In [7]:
X_data['cat_label'] = cat_label
X_data['brand_label'] = brand_label
X_data = X_data.drop(['name','category_name','brand_name'],axis = 1)

In [9]:
X_train,X_valid,y_train,y_valid = train_test_split(X_data,y_data,test_size = 0.2,random_state = 2)

In [12]:
dt = DecisionTreeRegressor(min_samples_leaf=9).fit(X_train,y_train)
y_pred = dt.predict(X_valid)
train_score = np.sqrt(mean_squared_log_error(y_valid,y_pred))
print('Train score: {}'.format(train_score))

Train score: 0.5951873136254393


In [13]:
y_pred_train = dt.predict(X_train)
valid_score = np.sqrt(mean_squared_log_error(y_train,y_pred_train))
print('Valid score: {}'.format(valid_score))

Valid score: 0.5794676821235564


### Submission

In [14]:
X_train = train.drop(['train_id','price','item_description'],axis = 1)
y_train = train['price']
X_test = test.drop(['test_id','item_description'],axis = 1)

In [16]:
X_all = pd.concat([X_train,X_test],axis = 0)
X_all['category_name'] = X_all['category_name'].fillna(value='not_decided')
X_all['brand_name'] = X_all['brand_name'].fillna(value='unknown')

In [17]:
cat_label = LabelEncoder().fit_transform(X_all['category_name'])
brand_label = LabelEncoder().fit_transform(X_all['brand_name'])
X_all['cat_label'] = cat_label
X_all['brand_label'] = brand_label
X_all = X_all.drop(['name','category_name','brand_name'],axis = 1)

In [19]:
X_train_labeled = X_all.iloc[:X_train.shape[0],:]
X_test_labeled = X_all.iloc[X_train.shape[0]:,:]

In [20]:
dt = DecisionTreeRegressor(min_samples_leaf=9).fit(X_train_labeled,y_train)
preds = dt.predict(X_test_labeled)
sub = test[['test_id']]
sub['price'] = preds
sub.to_csv('submission.csv',index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
