In [1361]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import re
from datetime import datetime
import math
import locale
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn import metrics
from xgboost import XGBRegressor, DMatrix
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from matplotlib import pyplot as plt 

In [1362]:
import spacy

In [1363]:
pd.set_option("display.max_columns", 100)

In [1364]:
locale.setlocale(locale.LC_ALL, 'de_DE')

'de_DE'

In [1365]:
phones = pd.read_csv('iphones.csv',sep = ";")

Keep only listings that are no longer active or that were not sold within 30 days (marked as old)

In [1366]:
phones = phones.loc[(phones['active'] == False) | (phones['old'] == True)]

We don't want to analyse resellers who offer multiple items for sale in one listing

Dropping multiitem listings

In [1367]:
phones = phones.loc[~(phones['multiple_items_available'] == True)]

Shape should stay the same, because they are active and not marked as old

It is critical to have NaN in model field

In [1368]:
# Count listings whose price text contains a pound or dollar sign.
# The pattern is a raw string so that "\$" reaches the regex engine as an escaped
# dollar sign instead of being an invalid Python string escape.
phones.loc[phones['price'].str.contains(r"£|\$")].shape

(2440, 26)

We were not interested in listings from other countries or in other languages, but there are still some in the dataset. Let's drop them

In [1369]:
# Drop listings whose price text contains a pound or dollar sign.
# The pattern is a raw string so that "\$" reaches the regex engine as an escaped
# dollar sign instead of being an invalid Python string escape.
phones = phones.loc[~phones['price'].str.contains(r"£|\$")]

Checking left links manually...

In [1370]:
phones.shape

(9392, 26)

In [1371]:
phones.loc[phones['model'].isnull()].iloc[10].loc['link']

'https://www.ebay.de/itm/Apple-IPhone-6s-spacegrau-64-GB-A1688-kein-Simlock-Branding-TOP-48/274665417668?hash=item3ff355abc4%3Ag%3A6uIAAOSwfGhfoYYc&LH_ItemCondition=3000&LH_BIN=1'

I found out that some listings have no model. It would be possible to extract it from the title, but for now we just drop them

In [1372]:
phones = phones.loc[~phones['model'].isnull()]

In [1373]:
phones.shape

(9110, 26)

Checking for duplicates

In [1374]:
duplicated = phones.loc[phones.duplicated(keep=False)]

In [1375]:
duplicated

Unnamed: 0,_id,active,closure_date,closure_reason,color,condition,creation_date,fianl_shipping_cost,final_price,last_update,link,memory,mobile_operator,model,multiple_items_available,number_of_reviews,old,page_not_found,photos,price,return_policy,search_term,selers_feedback,shipping_cost,text_description,title


Drop nan

In [1376]:
# doesn't change anything phones = phones.dropna(subset = ["memory"])

In [1377]:
phones = phones.loc[~phones['color'].isnull()]

## Analyse closure_reason

In [1378]:
reasons = phones['closure_reason'].unique()

Drop rows with closure statuses that are out of scope

In [1379]:
closure_reasons_out_of_scope = ['Dieses Angebot wurde vom Verkäufer beendet, da der Artikel beschädigt wurde oder verloren ging.', \
                               'Dieses Angebot wurde vom Verkäufer beendet, da der Artikel nicht mehr verfügbar ist.', \
                               'Dieses Angebot wurde vom Verkäufer beendet, da es einen Fehler enthielt.',\
                               'Dieses Angebot wurde beendet. Der Verkäufer hat diesen oder einen gleichartigen Artikel wiedereingestellt.',\
                               'Dieses Angebot wurde vom Verkäufer beendet, da der Artikel verkauft wurde.',\
                               'Das Sofort-Kaufen-Angebot wurde beendet. Der Verkäufer hat diesen oder einen gleichartigen Artikel wiedereingestellt.']

In [1380]:
phones = phones.loc[~phones['closure_reason'].isin(closure_reasons_out_of_scope)]

## PRICE

There is one crazy high price and some prices with "/Stk." at the end

In [1381]:
phones['price'] = phones['price'].map(lambda price: float(price.replace('.','').replace('/Stk','').replace("EUR", "").replace(",",".")))

In [1382]:
# Convert final_price text to float: thousands dots, the '/Stk' suffix and the 'EUR'
# marker are stripped, then the decimal comma becomes a decimal point.
# Values that are already floats (e.g. NaN for missing prices) pass through unchanged.
# isinstance() also accepts float subclasses such as numpy.float64, which an exact
# type comparison would send into str.replace and crash.
phones['final_price'] = phones['final_price'].map(
    lambda price: price if isinstance(price, float)
    else float(price.replace('.', '').replace('/Stk', '').replace("EUR", "").replace(",", "."))
)


For items which were not sold and are marked as 'old', the final price is null. We need to fix it

In [1383]:
def set_final_price_for_old_listings(row):
    """Pick the final price for one listing row.

    Rows whose ``old`` flag equals True get their asking ``price``;
    every other row keeps its existing ``final_price``.
    """
    if row["old"] == True:
        return row["price"]
    return row["final_price"]

In [1384]:
phones["final_price"] = phones.apply(set_final_price_for_old_listings, axis=1)

In [1385]:
phones['title'] = phones['title'].str.lower()

In [1386]:
phones = phones.loc[~phones['title'].str.contains('fortnite')]

Remove iPhones that are unrealistically expensive or cheap

In [1387]:
phones = phones[(phones['final_price'] < 1300) & (phones['final_price'] > 10) ]

## Check model

In [1388]:
phones['model'] = phones['model'].str.lower()

remove "apple" at the beginning 

In [1389]:
phones['model'] = phones['model'].map(lambda model: model.replace('apple ',''))

Number of Gb in the title could be valuable feature. I want to keep it, but separate from the model

In [1390]:
# Flag rows whose model string embeds a storage size such as "64gb".
# Note: despite the column name, the check runs on the model field.
# Raw string: "\d" must reach the regex engine rather than be read as a string escape;
# the pattern matches the one used to strip the size from the model afterwards.
phones["title_contains_memory_info"] = phones['model'].map(lambda model: bool(re.search(r"\d+gb", model)))

In [1391]:
phones['model'] = phones['model'].map(lambda model: (re.sub(r"\d+gb", "", model)).strip())

In [1392]:
iphone_se = ["se", "iphone se", "iphone se (1. generation)"]

In [1393]:
iphone_se_2 = ["iphone se (2. generation)", "iphone se 2020", "iphone se 2"]

In [1394]:
iphone_7 = ["iphone 7", "7"]

In [1395]:
iphone_7_plus = ["iphone 7 plus"]

In [1396]:
iphone_6 = ["iphone 6"]

In [1397]:
iphone_6s = ["iphone 6s", "6s"]

In [1398]:
iphone_6s_plus = ["iphone 6s plus"]

In [1399]:
iphone_6_plus = ["iphone 6 plus"]

In [1400]:
iphone_8_plus = ["iphone 8 plus"]

In [1401]:
iphone_8 = ["iphone 8"]

In [1402]:
iphone_11 = ["iphone 11"]

In [1403]:
iphone_11_plus_max = ["iphone 11 pro max"]

In [1404]:
iphone_11_pro =["iphone 11 pro"]

In [1405]:
iphone_xr = ["iphone xr"]

In [1406]:
iphone_x = ["iphone x"]

In [1407]:
iphone_5 = ["iphone 5"]

In [1408]:
iphone_5c = ["iphone 5c"]

In [1409]:
iphone_5s = ["iphone 5s", "5s"]

In [1410]:
iphone_xs = ["iphone xs", "xs"]

In [1411]:
iphone_xs_max = ["iphone xs max", "xs max"]

In [None]:
iphone_12 = ["iphone 12"]

In [None]:
iphone_12_pro_max = ["iphone 12 pro max"]

In [None]:
iphone_12_pro = ["iphone 12 pro"]

In [None]:
iphone_12_mini = ["iphone 12 mini"]

In [None]:
iphone_13 = ["iphone 13"]

In [None]:
iphone_13_pro_max = ["iphone 13 pro max"]

In [None]:
iphone_13_pro = ["iphone 13 pro"]

In [None]:
# Accepted spellings for the iPhone 13 mini (previously a copy of the 12 mini entry).
iphone_13_mini = ["iphone 13 mini"]

In [1412]:
all_models = [iphone_se, iphone_se_2, iphone_7, iphone_7_plus, iphone_6, iphone_6s, iphone_6s_plus, iphone_6_plus, \
              iphone_8_plus, iphone_8, iphone_11_plus_max, iphone_11_pro, iphone_xr, iphone_x, iphone_5, iphone_5c, \
             iphone_5s, iphone_xs, iphone_xs_max, iphone_11, iphone_12, iphone_12_pro_max, iphone_12_pro, iphone_12_mini, \
             iphone_13, iphone_13_pro_max, iphone_13_pro, iphone_13_mini]

In [1413]:
# Flatten the per-model alias groups into one list of every accepted model spelling.
all_models_list = [alias for group in all_models for alias in group]

I want to exclude models that are out of scope

In [1414]:
exclude_models = [model for model in phones['model'].unique() if model not in all_models_list]

In [1415]:
phones = phones.loc[phones["model"].isin(all_models_list)]

In [1416]:
replace_model = {"6s": "iphone 6s", "7": "iphone 7", "5s": "iphone 5s", "se": "iphone se", 
                 "xs": "iphone xs", "xs max": "iphone xs max", "iphone se 2020": "iphone se 2", 
                 "iphone se (2. generation)": "iphone se 2", "iphone se (1. generation)": "iphone se"}

In [1417]:
phones = phones.replace({"model": replace_model})

In [1418]:
phones['model'] = phones['model'].map(lambda model: model.replace(" ","_").strip())

## Analyse closure_date and creation_date

In [1419]:
phones['closure_date'] = phones['closure_date'].str.extract(r'(^[0-3]\d\.\s[a-zA-Z]{3}\.\s20\d{2})')

In [1420]:
# Keep only the leading "DD. Mon. YYYY" part of last_update.
# expand=False yields a Series, so the month fix can be chained directly.
# "Mrz" is rewritten to "Mär" exactly as is done for creation_date and closure_date,
# because all three columns are later parsed with the same '%d. %b. %Y' format.
phones['last_update'] = (
    phones['last_update']
    .str.extract(r'(^[0-3]\d\.\s[a-zA-Z]{3}\.\s20\d{2})', expand=False)
    .str.replace('Mrz', 'Mär')
)

In [1421]:
phones['creation_date'] = phones['creation_date'].map(lambda model: (re.sub(r"\d{2}\:\d{2}\s", "", model)))

In [1422]:
# For some reason eBay uses Mrz. instead of Mär

In [1423]:
phones['creation_date'] = phones['creation_date'].str.replace('Mrz','Mär')

In [1424]:
phones['closure_date'] = phones['closure_date'].str.replace('Mrz','Mär')

In [1425]:
phones['creation_date'] =  pd.to_datetime(phones['creation_date'], format='%d. %b. %Y')

In [1426]:
phones['closure_date'] =  pd.to_datetime(phones['closure_date'], format='%d. %b. %Y')

In [1427]:
phones['last_update'] =  pd.to_datetime(phones['last_update'], format='%d. %b. %Y')

Most likely they are all 'out of stock'. Later they become closed by the customer. We may treat them as closed. If we don't have a closure date (it is not displayed when an item is out of stock), we can use the last update date instead. We check the data every day, so it should be accurate enough


In [1428]:
def set_closure_date(row):
    """Return the row's closure date, falling back to ``last_update`` when it is missing.

    ``pd.isna`` is used instead of comparing the string form with "NaT", so NaT,
    NaN and None are all recognised as a missing closure date.
    """
    closure_date = row["closure_date"]
    return row["last_update"] if pd.isna(closure_date) else closure_date


In [1429]:
phones["closure_date"] = phones.apply(set_closure_date, axis=1)

Those which don't have a last update date either just need to be removed (missing data)

In [1430]:
# Drop rows that still have no closure date; notna() tests for missing values
# directly instead of comparing each value's string form with "NaT".
phones = phones.loc[phones["closure_date"].notna()]

In [1431]:
phones['listing_was_active_before_closure'] = phones['closure_date'] - phones['creation_date']

In [1432]:
phones['listing_was_active_before_closure'] = phones['listing_was_active_before_closure'].map(lambda date: date.days)

In [1433]:
median = phones['listing_was_active_before_closure'].median()

In [1434]:
median

3.0

## fianl_shipping_cost field

In [1435]:
phones.fianl_shipping_cost = phones.fianl_shipping_cost.fillna(phones.shipping_cost)

what we have except of price in euro:

Checked manually. 'Abholung möglich' looks like an option for ebayplus. Could be replaced with 'KOSTENLOS'

'Kostenlose Abholung' is an extremely rare case. Drop it

In [1436]:
phones = phones.loc[~((phones['fianl_shipping_cost'].isin(['Kostenlose Abholung', 'Standardversand'])) | (phones['shipping_cost'].isin(['Kostenlose Abholung', 'Standardversand'])))]

In [1437]:
phones['fianl_shipping_cost'] = phones['fianl_shipping_cost'].astype(str)
phones['shipping_cost'] = phones['shipping_cost'].astype(str)

In [1438]:
# Convert the final shipping cost text to a number:
#   "nan"                            -> left as is (turned into NaN by the later float cast)
#   'KOSTENLOS' / 'Abholung möglich' -> 0
#   anything else                    -> "EUR" removed, decimal comma -> point, parsed as float
phones['fianl_shipping_cost'] = phones['fianl_shipping_cost'].map(
    lambda cost: cost if cost == "nan"
    else 0 if cost in ('KOSTENLOS', 'Abholung möglich')
    else float(cost.replace("EUR", "").replace(",", "."))
)


In [1439]:
# Convert the listed shipping cost text to a number:
#   "nan"                            -> left as is (turned into NaN by the later float cast)
#   'KOSTENLOS' / 'Abholung möglich' -> 0
#   anything else                    -> "EUR" removed, decimal comma -> point, parsed as float
phones['shipping_cost'] = phones['shipping_cost'].map(
    lambda cost: cost if cost == "nan"
    else 0 if cost in ('KOSTENLOS', 'Abholung möglich')
    else float(cost.replace("EUR", "").replace(",", "."))
)


In [1440]:
phones['fianl_shipping_cost'] = phones['fianl_shipping_cost'].astype(float)

In [1441]:
phones['shipping_cost'] = phones['shipping_cost'].astype(float)

## Memory column

remove whitespaces

In [1442]:
phones['memory'] = phones['memory'].astype(str)


In [1443]:
phones['memory'] = phones['memory'].map(lambda memory: memory.replace(" ","").strip())

In [1444]:
# Memory values that occur in fewer than 20 listings.
# value_counts() tallies every value in one pass instead of re-filtering the whole
# frame once per unique value.
memory_counts = phones['memory'].value_counts()
low_frequency_memory_values = memory_counts[memory_counts < 20].index.tolist()

In [1445]:
phones = phones.loc[~phones["memory"].isin(low_frequency_memory_values)]

remove nan

## mobile_operator field

In [1446]:
phones['mobile_operator'] = phones['mobile_operator'].str.lower()

In [1447]:
phones['mobile_operator'] = phones['mobile_operator'].astype(str)

In [1448]:
# True when no operator is given (the string 'nan') or when the operator text
# contains any of the markers for an unlocked phone.
phones['without_mobile_operator'] = phones['mobile_operator'].map(
    lambda operator: operator == 'nan'
    or any(marker in operator for marker in ["ohne simlock", "alle netze", "frei", "alle", "unlocked"])
)


## photos field

Analysing images is out of scope for now, but we can use the number of images as an additional field

In [1449]:
# Count the entries of the bracketed, ', '-separated photo list stored as text.
# Empty pieces are ignored so that an empty list ("[]") counts as 0 photos, not 1.
# NOTE(review): assumes every value in 'photos' is a string — confirm there are no NaN.
phones["number_of_photos"] = phones["photos"].map(
    lambda photos: len([item for item in photos.strip('][').split(', ') if item.strip()])
)

## return_policy field

In [1450]:
phones['return_policy'] = phones['return_policy'].astype(str)

In [1451]:
# Return-policy values that occur in fewer than 5 listings.
# value_counts() tallies every value in one pass instead of re-filtering the whole
# frame once per unique value.
return_policy_counts = phones['return_policy'].value_counts()
low_frequency_return_values = return_policy_counts[return_policy_counts < 5].index.tolist()

In [1452]:
phones = phones.loc[~phones["return_policy"].isin(low_frequency_return_values)]

## selers_feedback field

In [1453]:
phones["no_feedback_yet"] = phones["selers_feedback"].isnull()

In [1454]:
phones['selers_feedback'] = phones['selers_feedback'].str.extract(r'(^\d.+\%)')

In [1455]:
phones['selers_feedback'] = phones['selers_feedback'].map(lambda feedback: float(str(feedback).replace("%","").replace(",",".")))

In [1456]:
phones = phones.fillna({"selers_feedback" : 100.0})

## Color field

In [1457]:
phones['color'] = phones['color'].astype(str)

In [1458]:
phones['color'] = phones['color'].str.lower()

In [1459]:
color_dict = {"grey": "grau",
    "white": "weiß",
    "red": "rot",
    "weiss": "weiß",
    "black": "schwarz",
    "und": "",
    "/": "",
    "rose": "rosa",
    "silver": "silber",
    "rosé": "rosa",
    "rosè": "rosa",
    "pink": "rosa",
    "gray": "grau",
    " ": ""
}

In [1460]:
def update_color(color):
    """Normalise a colour string by applying every substitution in ``color_dict``.

    The substitutions run one after another in the dictionary's iteration order,
    each on the result of the previous one.
    """
    for needle, replacement in color_dict.items():
        color = color.replace(needle, replacement)
    return color

In [1461]:
phones['color'] = phones['color'].map(lambda color: update_color(color))

In [1462]:
# Colour values that occur in fewer than 10 listings.
# value_counts() tallies every value in one pass instead of re-filtering the whole
# frame once per unique value.
color_counts = phones['color'].value_counts()
low_frequency_color_values = color_counts[color_counts < 10].index.tolist()

In [1463]:
def update_rare_colors(color):
    """Return "other" for colours listed in ``low_frequency_color_values``, else the colour itself."""
    return "other" if color in low_frequency_color_values else color

In [1464]:
phones['color'] = phones['color'].map(lambda color: update_rare_colors(color))

## Condition

In [1465]:
positive_labels = ['sehr gut', 'top zustand', 'neuwertigen', 'keine kratzer', 'wie neu', 'micro-kratzer']

In [1466]:
negative_labels = ['akzeptabel', 'starke gebrauchsspuren', 'schade']

In [1467]:
phones['condition']= phones['condition'].str.lower()

In [1468]:
phones['title']= phones['title'].str.lower()

In [1469]:
# A listing is "very good" when its condition text or its title contains a positive label.
# na=False makes a missing condition/title count as "no match", so the mask stays
# strictly boolean and can be cast to int afterwards.
positive_pattern = '|'.join(positive_labels)
phones['very_good_condition'] = (
    phones.condition.str.contains(positive_pattern, na=False)
    | phones.title.str.contains(positive_pattern, na=False)
)

In [1470]:
# A listing is "very bad" when its condition text or its title contains a negative label.
# na=False makes a missing condition/title count as "no match", so the mask stays
# strictly boolean and can be cast to int afterwards.
negative_pattern = '|'.join(negative_labels)
phones['very_bad_condition'] = (
    phones.condition.str.contains(negative_pattern, na=False)
    | phones.title.str.contains(negative_pattern, na=False)
)

In [1471]:
phones["very_good_condition"] = phones["very_good_condition"].astype(int)

In [1472]:
phones["very_bad_condition"] = phones["very_bad_condition"].astype(int)

## Encode categorical features

In [1473]:
phones.loc[phones['listing_was_active_before_closure'].isna()]

Unnamed: 0,_id,active,closure_date,closure_reason,color,condition,creation_date,fianl_shipping_cost,final_price,last_update,link,memory,mobile_operator,model,multiple_items_available,number_of_reviews,old,page_not_found,photos,price,return_policy,search_term,selers_feedback,shipping_cost,text_description,title,title_contains_memory_info,listing_was_active_before_closure,without_mobile_operator,number_of_photos,no_feedback_yet,very_good_condition,very_bad_condition


In [1474]:
phones = phones.reset_index()

In [1475]:
features_for_one_hot_enc = ["model", "color", "memory", 'return_policy']

In [1476]:
for feature in features_for_one_hot_enc:
    phones[feature] = feature + "_" + phones[feature]

In [1477]:
# One-hot encode the categorical features and append the indicator columns to phones.
encoder = OneHotEncoder()
encoded = encoder.fit_transform(phones.loc[:, features_for_one_hot_enc])
# Column labels: the category values of every encoded feature, in encoder output order.
categories = np.concatenate(encoder.categories_, axis=None)
# Build all indicator columns in one frame and concatenate once, instead of growing
# phones by one column per loop iteration. The new frame has a default 0..n-1 index,
# so it lines up with phones only when phones has one too (as after reset_index()).
encoded_columns = pd.DataFrame(encoded.toarray(), columns=categories)
phones = pd.concat([phones, encoded_columns], axis=1, sort=False)

In [1478]:
model_features = []

In [1479]:
model_features.extend(categories)

## Encode binary values 

In [1480]:
phones['old'].unique() #don't use them. Already added number big value before the sale days

array([nan, True], dtype=object)

In [1481]:
phones["no_feedback_yet"].unique()

array([False,  True])

In [1482]:
phones["no_feedback_yet"] = phones["no_feedback_yet"].astype(int)

In [1483]:
model_features.append('no_feedback_yet')

In [1484]:
model_features.append('very_good_condition')

In [1485]:
model_features.append('very_bad_condition')

## Numerical values

price and final_price: need combine them

In [1486]:
def set_price(row):
    """Return ``final_price`` for the row, or ``price`` when ``final_price`` is NaN."""
    final_price = row['final_price']
    return row['price'] if math.isnan(final_price) else final_price

In [1487]:
phones["price"] = phones.apply(set_price, axis=1)

fianl_shipping_cost and shipping_cost: need combine them

In [1488]:
def set_shipping(row):
    """Return ``fianl_shipping_cost`` for the row, or ``shipping_cost`` when it is NaN."""
    final_cost = row['fianl_shipping_cost']
    return row['shipping_cost'] if math.isnan(final_cost) else final_cost

In [1489]:
phones["shipping_cost"] = phones.apply(set_shipping, axis=1)

Remove resellers

In [1490]:
phones = phones.loc[phones["number_of_reviews"] < 5000]

In [1491]:
model_features.extend(["shipping_cost", "number_of_reviews", "selers_feedback", "listing_was_active_before_closure"])

### MEAN PRICE

In [1492]:
phones["total_price"] = phones["price"] + phones["shipping_cost"]

In [1493]:
model_features.append('price')

## Bug!!! Need to calculate mean_prices only on the Train set. Don't use test set data here!

In [1494]:
mean_prices = phones.groupby(['model', 'memory', 'color'])['total_price'].mean().reset_index()

In [1495]:
def set_mean_price(row):
    """Look up the reference total price for a listing row in ``mean_prices``.

    The lookup goes from most to least specific:
      1. same model, memory and color
      2. same model and memory
      3. same model

    The first level with at least one matching row wins and the mean of the matching
    ``total_price`` values is returned. An exact match has a single row, so its value
    comes back unchanged; the fallback levels can match several rows and are averaged
    rather than taking whichever row happens to come first.

    Returns NaN when not even the model is present in ``mean_prices``.
    """
    same_model = mean_prices.model == row.model
    same_memory = mean_prices.memory == row.memory
    same_color = mean_prices.color == row.color
    for mask in (same_model & same_memory & same_color, same_model & same_memory, same_model):
        matches = mean_prices.loc[mask, "total_price"]
        if not matches.empty:
            return matches.mean()
    return float("nan")

In [1496]:
phones["mean_price"] = phones.apply(set_mean_price, axis=1)

In [1497]:
phones["mean_price"].describe()

count    4435.000000
mean      221.597878
std       167.548515
min        16.000000
25%        87.180000
50%       169.829350
75%       319.530814
max       881.900000
Name: mean_price, dtype: float64

In [1498]:
phones["price_difference"] = (phones["total_price"] - phones["mean_price"])/phones["mean_price"]

In [1499]:
phones = phones.loc[phones["price_difference"] < 1.5]

In [1500]:
phones = phones.loc[phones["price_difference"] > -1]

In [1501]:
mean_prices = phones.groupby(['model', 'memory', 'color'])['total_price'].mean().reset_index()

In [1502]:
phones["mean_price"] = phones.apply(set_mean_price, axis=1)

In [1503]:
mean_prices.to_pickle("mean_prices")

## Save median active days

In [1541]:
median_active_days = phones.groupby(['model', 'memory', 'color'])['listing_was_active_before_closure'].agg(['median', 'count']).reset_index()

In [1543]:
median_active_days.to_pickle("median_active_days")

## Preparing features and data format for the model

In [1506]:
normalize = {" ": "_", "ö": "o", "ä": "a", "ß": "ss", "ü": "u"}

In [1507]:
for key in normalize:
    phones.columns = phones.columns.str.replace(key, normalize[key])

In [1508]:
# Short names for the long one-hot columns generated from the return-policy texts
# (keys are the column names after the character normalisation above).
# NOTE(review): "Kaufer_zahlt_Ruckversand" reads as "buyer pays return shipping", yet that
# column is renamed to "return_policy_seller_pays", while the variant without the suffix
# becomes "return_policy_buyer_pays". The two labels look swapped — confirm before
# interpreting these features.
rename_columns = {
                    "return_policy_Verbraucher_konnen_den_Artikel_zu_den_unten_angegebenen_Bedingungen_zuruckgeben_Kaufer_zahlt_Ruckversand": "return_policy_seller_pays",
                    "return_policy_Verbraucher_konnen_den_Artikel_zu_den_unten_angegebenen_Bedingungen_zuruckgeben": "return_policy_buyer_pays",
    "return_policy_Keine_Rucknahme": "return_policy_no_return"}

In [1509]:
phones = phones.rename(rename_columns, axis=1)

In [1510]:
phones_filename = 'phones.pkl'
phones.to_pickle(phones_filename)

In [1511]:
phones = pd.read_pickle(phones_filename)

features

In [1512]:
for key in normalize:
    model_features = [w.replace(key, normalize[key]) for w in model_features]

In [1513]:
for key in rename_columns:
    if key in model_features:
        model_features.remove(key)
        model_features.append(rename_columns[key])

In [1514]:
phones['return_policy'].unique()

array(['return_policy_Keine Rücknahme',
       'return_policy_Verbraucher können den Artikel zu den unten angegebenen Bedingungen zurückgeben'],
      dtype=object)

In [1515]:
model_features

['model_iphone_11',
 'model_iphone_11_pro',
 'model_iphone_11_pro_max',
 'model_iphone_5',
 'model_iphone_5c',
 'model_iphone_5s',
 'model_iphone_6',
 'model_iphone_6_plus',
 'model_iphone_6s',
 'model_iphone_6s_plus',
 'model_iphone_7',
 'model_iphone_7_plus',
 'model_iphone_8',
 'model_iphone_8_plus',
 'model_iphone_se',
 'model_iphone_se_2',
 'model_iphone_x',
 'model_iphone_xr',
 'model_iphone_xs',
 'model_iphone_xs_max',
 'color_blau',
 'color_gelb',
 'color_gold',
 'color_grau',
 'color_grun',
 'color_lila',
 'color_orange',
 'color_other',
 'color_rosa',
 'color_rosagold',
 'color_rot',
 'color_schwarz',
 'color_silber',
 'color_spacegrau',
 'color_weiss',
 'memory_128GB',
 'memory_16GB',
 'memory_256GB',
 'memory_32GB',
 'memory_512GB',
 'memory_64GB',
 'no_feedback_yet',
 'very_good_condition',
 'very_bad_condition',
 'shipping_cost',
 'number_of_reviews',
 'selers_feedback',
 'listing_was_active_before_closure',
 'price',
 'return_policy_buyer_pays',
 'return_policy_no_return

In [1516]:
model_features_filename = 'model_features.pkl'

In [1517]:
# Persist the feature list; the context manager closes the file handle
# (and flushes the data) even if dump() raises.
with open(model_features_filename, 'wb') as features_file:
    pickle.dump(model_features, features_file)


In [1518]:
# Read the feature list back; the context manager closes the file handle afterwards.
# Only unpickle files produced by this notebook: pickle.load can execute arbitrary code.
with open(model_features_filename, 'rb') as features_file:
    loaded_model_feature = pickle.load(features_file)

In [1519]:
model_features = loaded_model_feature

In [1520]:
model_features.append('mean_price')

In [1521]:
model_features

['model_iphone_11',
 'model_iphone_11_pro',
 'model_iphone_11_pro_max',
 'model_iphone_5',
 'model_iphone_5c',
 'model_iphone_5s',
 'model_iphone_6',
 'model_iphone_6_plus',
 'model_iphone_6s',
 'model_iphone_6s_plus',
 'model_iphone_7',
 'model_iphone_7_plus',
 'model_iphone_8',
 'model_iphone_8_plus',
 'model_iphone_se',
 'model_iphone_se_2',
 'model_iphone_x',
 'model_iphone_xr',
 'model_iphone_xs',
 'model_iphone_xs_max',
 'color_blau',
 'color_gelb',
 'color_gold',
 'color_grau',
 'color_grun',
 'color_lila',
 'color_orange',
 'color_other',
 'color_rosa',
 'color_rosagold',
 'color_rot',
 'color_schwarz',
 'color_silber',
 'color_spacegrau',
 'color_weiss',
 'memory_128GB',
 'memory_16GB',
 'memory_256GB',
 'memory_32GB',
 'memory_512GB',
 'memory_64GB',
 'no_feedback_yet',
 'very_good_condition',
 'very_bad_condition',
 'shipping_cost',
 'number_of_reviews',
 'selers_feedback',
 'listing_was_active_before_closure',
 'price',
 'return_policy_buyer_pays',
 'return_policy_no_return

In [1522]:
all_data = phones[["_id"]]

In [1523]:
model_features.append("_id")

In [1524]:
model_features.append("model")

# SPLIT SET

In [1525]:
all_data = pd.merge(all_data, phones[model_features], on=['_id'], how='left')

#TODO check if "return_policy_Verbraucher_konnen_den_Artikel_zu_den_unten_angegebenen_Bedingungen_zurückgeben" really always means that buyer pays

In [1526]:
data_filename = 'all_data.pkl'
all_data.to_pickle(data_filename)

In [1527]:
all_data = pd.read_pickle(data_filename)


In [1528]:
all_data.shape

(4427, 54)

In [1529]:
train, test = train_test_split(all_data, test_size=0.1, random_state=42)

In [1530]:
X_train = train.drop(['_id', 'model', 'price'], axis=1)
X_test = test.drop(['_id', 'model', 'price'], axis=1)

In [1531]:
Y_train = train['price']
Y_test = test['price']

In [1532]:
X_train_filename = 'X_train.pkl'

In [1533]:
X_test_filename = "X_test.pkl"

In [1534]:
Y_train_filename = "Y_train.pkl"

In [1535]:
Y_test_filename = "Y_test.pkl"

In [1536]:
X_train.to_pickle(X_train_filename)

In [1537]:
X_test.to_pickle(X_test_filename)

In [1538]:
Y_train.to_pickle(Y_train_filename)

In [1539]:
Y_test.to_pickle(Y_test_filename)

In [1540]:
X_train.head(1)

Unnamed: 0,model_iphone_11,model_iphone_11_pro,model_iphone_11_pro_max,model_iphone_5,model_iphone_5c,model_iphone_5s,model_iphone_6,model_iphone_6_plus,model_iphone_6s,model_iphone_6s_plus,model_iphone_7,model_iphone_7_plus,model_iphone_8,model_iphone_8_plus,model_iphone_se,model_iphone_se_2,model_iphone_x,model_iphone_xr,model_iphone_xs,model_iphone_xs_max,color_blau,color_gelb,color_gold,color_grau,color_grun,color_lila,color_orange,color_other,color_rosa,color_rosagold,color_rot,color_schwarz,color_silber,color_spacegrau,color_weiss,memory_128GB,memory_16GB,memory_256GB,memory_32GB,memory_512GB,memory_64GB,no_feedback_yet,very_good_condition,very_bad_condition,shipping_cost,number_of_reviews,selers_feedback,listing_was_active_before_closure,return_policy_buyer_pays,return_policy_no_return,mean_price
1126,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0,4.99,1142,100.0,1,0.0,1.0,28.163
