In [9]:
import pandas as pd
import plotly.express as px

pd.options.mode.chained_assignment = None  # this turns off insignificant warning when processing dataset

# loading standardized data
data = pd.read_csv('../data/recruitment_data_standardized.csv',
                   encoding='utf-8',
                   sep=',',
                   on_bad_lines='skip',
                   quotechar='"',
                   doublequote=True,
                   names=['URL', 'Voivodeship', 'Scrap_time', 'Name', 'Price', 'Brand', 'Condition', 'Offer_from',
                          'Type', 'Description', 'Added_at', 'Views', 'User_since'],
                   skiprows=1)

In [16]:
data

Unnamed: 0,URL,Voivodeship,Scrap_time,Name,Price,Brand,Condition,Offer_from,Type,Description,Added_at,Views,User_since
0,https://www.olx.pl/oferta/iphone-11-64-jak-now...,pomorskie,2021-02-22 06:55:30,Iphone 11 64 jak nowy 95% gwarancja wyświetlacz,2799.0,iPhone,Używane,Osoby prywatnej,Sprawny,Jak nowy . Kondycja baterii 95%. Kupiony w med...,2021-02-22 00:09:00,37,2013-05-01 00:00:00;;;;
1,https://www.olx.pl/oferta/skup-uszkodzonych-te...,pomorskie,2021-02-22 06:55:34,Skup uszkodzonych telefonów iPhone xs xs max 1...,,,,Firmy,,Witam. Kupię uszkodzone/ zablokowane/ zalane/...,2021-02-22 00:05:00,5242,2020-04-01 00:00:00;;;;
2,https://www.olx.pl/oferta/iphone-11-64-gb-czar...,pomorskie,2021-02-22 06:55:40,"IPhone 11 64 GB czarny, idealny z gwarancją. W...",2700.0,iPhone,Używane,Osoby prywatnej,Sprawny,Witam! Mam na sprzedaż iPhone’a 11 w wersji 64...,2021-02-21 19:00:00,186,2014-12-01 00:00:00;;;;
3,https://www.olx.pl/oferta/iphone-11-CID99-IDIk...,pomorskie,2021-02-22 06:55:44,Iphone 11,3000.0,iPhone,Nowe,Osoby prywatnej,Sprawny,Nowy 128GB Oryginalnie zapakowany kolor czar...,2021-02-21 18:24:00,250,2016-06-01 00:00:00;;;;
4,https://www.olx.pl/oferta/jak-nowy-apple-iphon...,pomorskie,2021-02-22 06:55:52,Jak Nowy Apple Iphone 11 256gbGB White Gwarancja,2899.0,iPhone,Używane,Firmy,Sprawny,Witaj. Jesteśmy sklepem - serwisem z 12 le...,2021-02-21 17:38:00,845,2012-08-01 00:00:00;;;;
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4979,https://www.olx.pl/oferta/iphone-11-red-64-gb-...,podkarpackie,2021-02-22 05:54:53,"IPhone 11 Red, 64 GB",2990.0,iPhone,Nowe,Osoby prywatnej,Sprawny,"Witam mam na sprzedaż nowego IPhona 11 64 GB, ...",2021-02-21 12:16:00,177,2018-06-01 00:00:00;;;;
4980,https://www.olx.pl/oferta/iphone-11-white-64gb...,podkarpackie,2021-02-22 05:55:26,Iphone 11 white 64gb,1650.0,iPhone,Używane,Osoby prywatnej,Sprawny,Na sprzedaż posiadam iphone 11 white o pojemno...,2021-02-20 13:42:00,980,2021-02-01 00:00:00;;;;
4981,https://www.olx.pl/oferta/etui-iphone-11-CID99...,podlaskie,2021-02-22 06:28:04,Etui iPhone 11,100.0,,Używane,Osoby prywatnej,,"Witam, mam na sprzedaż dwa oryginalne etui od ...",2021-02-22 04:51:00,78,2016-06-01 00:00:00;;;;
4982,https://www.olx.pl/oferta/iphone-11-purple-fio...,podlaskie,2021-02-22 06:28:11,IPhone 11 purple fioletowy nowy 64 GB,2680.0,iPhone,Nowe,Osoby prywatnej,Sprawny,Dzień Dobry. Posiadam na sprzedaż iphone 11 pu...,2021-02-21 19:21:00,72,2017-02-01 00:00:00;;;;


In [5]:
px.bar(pd.DataFrame(data.isna().sum())).update_layout(xaxis_title='Feature name',
                                                      yaxis_title='Number of NaNs',
                                                      title='Bar plot for amount of NaNs in dataset',
                                                      title_x=0.5,
                                                      showlegend=False)

In [97]:
px.pie(data, names='Condition', title='Percentage of offers in different conditions').update_layout(title_x=0.5)

In [98]:
px.pie(data, names='Voivodeship', title='Percentage of offers from different voivodeships').update_layout(title_x=0.5)

In [91]:
# Operations on dataset shown below are described in detail in notebook 03_data_preprocessing.ipynb
data['Name'] = data.loc[:, 'Name'].str.lower()
data['Description'] = data.loc[:, 'Description'].str.lower()
data['Concatenated_description'] = data['Name'] + ' ' + data['Description']
data_dropped_price_nans = data.dropna(subset=['Price']).reset_index(drop=True)
data_dropped_price_nans['Date'] = pd.to_datetime(data_dropped_price_nans['Added_at']).dt.date
data_reduced_dims = data_dropped_price_nans.query("Condition == 'Używane' & Type == 'Sprawny' & Brand == 'iPhone'")
data_reduced_dims = data_reduced_dims[data_reduced_dims.Price > 1000]
data_reduced_dims = data_reduced_dims.drop(
    columns=['Voivodeship', 'Scrap_time', 'Views', 'User_since', 'Added_at', 'URL', 'Brand', 'Condition', 'Offer_from',
             'Type'])
data_concatenated = data_reduced_dims.drop(columns=['Name', 'Description']).reset_index(drop=True)

In [47]:
data_concatenated

Unnamed: 0,Price,Concatenated_description,Date
0,2799.0,iphone 11 64 jak nowy 95% gwarancja wyświetlac...,2021-02-22
1,2700.0,"iphone 11 64 gb czarny, idealny z gwarancją. w...",2021-02-21
2,2899.0,jak nowy apple iphone 11 256gbgb white gwaranc...,2021-02-21
3,2500.0,apple iphone 11 biały 64gb - jak nowy gwarancj...,2021-02-21
4,2150.0,"iphone 11 64 gb + gwarancja witam, mam na sprz...",2021-02-21
...,...,...,...
2714,2299.0,iphone 11 black 64gb sprzedam iphone 11 64 gb ...,2021-02-21
2715,1900.0,i phone 11 64 gb cena tylko dzis cena tylko dz...,2021-02-21
2716,2800.0,"iphone 11 128 gb gwarancja , 100% bateria spr...",2021-02-21
2717,1650.0,iphone 11 white 64gb na sprzedaż posiadam ipho...,2021-02-20


In [49]:
data_concatenated['Concatenated_description'].str.contains('iphone 11').sum() # method for filtering data

2562

In [52]:
data_concatenated['Concatenated_description'].str.contains('iphone 11 pro max').sum() # method for filtering data

264

In [92]:
# adding filtering column based on phrase present in name + description column
data_concatenated['Phone model'] = None
for sample in range(len(data_concatenated)):
    if 'iphone 11 pro max' in data_concatenated['Concatenated_description'][sample]:
        data_concatenated['Phone model'][sample] = 'iphone 11 pro max'
    elif 'iphone 11 pro' in data_concatenated['Concatenated_description'][sample]:
        data_concatenated['Phone model'][sample] = 'iphone 11 pro'
    elif 'iphone 11' in data_concatenated['Concatenated_description'][sample]:
        data_concatenated['Phone model'][sample] = 'iphone 11'

In [93]:
# dropping unused column
data_grouped = data_concatenated.drop(columns='Concatenated_description').reset_index(drop=True)

In [114]:
px.box(data_grouped, x='Phone model', y='Price',
       title='Average prices of different iPhone 11 models with standard deviation').update_layout(title_x=0.5)

In [126]:
data_grouped.groupby(['Date', 'Phone model'])['Price'].mean()

Date        Phone model  
2021-01-01  iphone 11        2495.642857
2021-01-02  iphone 11        2583.194444
2021-01-03  iphone 11        2577.861364
2021-01-04  iphone 11        2617.810811
            iphone 11 pro    3466.333333
                                ...     
2021-02-21  iphone 11 pro    2450.000000
2021-02-22  iphone 11        2497.848485
2021-02-23  iphone 11        2474.170732
2021-02-24  iphone 11        2435.025641
2021-02-25  iphone 11        2212.250000
Name: Price, Length: 114, dtype: float64

In [119]:
# grouping data by phone model
groups = dict(list(data_grouped.groupby(['Phone model'])['Price', 'Date']))


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



In [120]:
groups.keys()

dict_keys(['iphone 11', 'iphone 11 pro', 'iphone 11 pro max'])

In [121]:
# data is correctly grouped into models with date and price columns
groups['iphone 11']

Unnamed: 0,Price,Date
0,2799.0,2021-02-22
1,2700.0,2021-02-21
2,2899.0,2021-02-21
3,2500.0,2021-02-21
4,2150.0,2021-02-21
...,...,...
2666,2700.0,2021-02-21
2667,2299.0,2021-02-21
2669,2800.0,2021-02-21
2670,1650.0,2021-02-20


In [123]:
px.box(groups['iphone 11'], x='Date', y='Price',
       title='Average prices of iPhone 11 base model with standard deviation over time').update_layout(title_x=0.5)

In [127]:
px.box(groups['iphone 11 pro'], x='Date', y='Price',
       title='Average prices of iPhone 11 Pro with standard deviation over time', range_x=['2021-01-01', '2021-02-24']).update_layout(title_x=0.5)

In [128]:
px.box(groups['iphone 11 pro max'], x='Date', y='Price',
       title='Average prices of iPhone 11 Pro Max with standard deviation over time', range_x=['2021-01-01', '2021-02-24']).update_layout(title_x=0.5)

### Simple analysis with data exploration, filtering and grouping methods shows that most probably there is no visible relation between price of iPhones and progressing time series for all 3 submodels. This could occur, because there were too few samples in the dataset (especially for Pro and Pro Max models) or perhaps such a relation does not exist in real life. This is a very important insight to have in mind going into NLP model building.