# Analysis: Store Price

Goal: Run linear regression on all features for target variable store-price.

In [1]:
import os
import pandas as pd
import numpy as np
import sklearn.model_selection as ms

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
%load_ext autoreload
%autoreload 2         # reload custom py file eveytime a new cell is run

#import helper 

## Prepare Target & Features

In [4]:
FULL_FILE = 'data/brickset_features.csv'
full_df = pd.read_csv(FULL_FILE)
full_df.head()
full_df.info()

Unnamed: 0,set_no,name,price_store,price_used,popularity,group,year,total_parts,number_parts,number_colors,scolor_1st,scolor_2nd
0,1591-1,Danone Delivery Truck,,28.0,,Modern day,1980.0,40,19,4,White,Blue
1,1592-1,Town Square - Castle Scene,,235.0,19.2,Modern day,1980.0,495,133,10,Black,Yellow
2,1651-2,Maersk Line Container Lorry,,444.0,5.0,Modern day,1980.0,305,66,9,Blue,Gray
3,6305-1,Trees and Flowers,3.75,10.0,17.2,Modern day,1980.0,24,6,4,Green,Red
4,6306-1,Road Signs,2.5,13.0,12.0,Modern day,1980.0,12,10,1,White,White


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3213 entries, 0 to 3212
Data columns (total 12 columns):
set_no           3213 non-null object
name             3213 non-null object
price_store      2762 non-null float64
price_used       3140 non-null float64
popularity       3001 non-null float64
group            3213 non-null object
year             3213 non-null float64
total_parts      3213 non-null int64
number_parts     3213 non-null int64
number_colors    3213 non-null int64
scolor_1st       3213 non-null object
scolor_2nd       3213 non-null object
dtypes: float64(4), int64(3), object(5)
memory usage: 301.3+ KB


In [5]:
# filter out sets without store-price

full_df = full_df.loc[full_df['price_store'].isna()==False]
full_df.shape

(2762, 12)

In [6]:
# target series

target_sr = full_df['price_store']
target_sr.shape

(2762,)

In [7]:
# feature dataframe with one-hot encoding

feature_ls = ['group', 'year', 'total_parts', 'number_parts', 'number_colors', 'scolor_1st', 'scolor_2nd']
feature_cat_df = full_df[feature_ls]

feature_df = pd.get_dummies(feature_cat_df, 
               columns = ['group', 'scolor_1st', 'scolor_2nd'], 
               drop_first= True)
feature_df.shape

(2762, 27)

## Split Out Test

In [8]:
# not able to stratify the subsets 

stratify_sr = target_sr.value_counts()
stratify_sr = stratify_sr[stratify_sr > 1]
stratify_sr.head()
stratify_sr.shape

19.99    210
29.99    165
9.99     145
39.99    142
49.99    140
Name: price_store, dtype: int64

(129,)

In [11]:
X_train, X_test, y_train, y_test = ms.train_test_split(target_sr, feature_cat_df, test_size=0.2, 
                                                       random_state=1)
X_train.shape
X_test.shape
y_train.shape
y_test.shape 

(2209,)

(553,)

(2209, 7)

(553, 7)