In [1]:
from sqlalchemy import create_engine
import pandas as pd
import yaml
import numpy as np
import sklearn
import os
from os.path import isfile

In [2]:
fb_df = pd.read_json("data/products_table.json")

In [None]:
fb_df.head()
#8091 rows and 8 columns

In [None]:
fb_df.info()

In [None]:
fb_df['product_name'].describe()

In [None]:
fb_df.columns

In [None]:
import seaborn as sns
sns.pairplot(fb_df)

In [3]:
def remove_n_a_records(df, column: str):
    """
    Scan the column for records with all N/As. Get rid of them

    Args:
        column (str): The column currently being scanned.
    """
    # Swap N/A for the pandas nan, so we can drop them
    temp_df = df[column].replace('N/A', np.nan)
    temp_df = temp_df.dropna()
    # Create a new df with only the records without the nans
    clean_df = pd.merge(temp_df, df,
                        left_index=True, right_index=True)
    # The merge creates a duplicate column. Remove it.
    clean_df.drop(column + '_x', inplace=True, axis=1)
    # Rename the remaining category column
    clean_df.rename(columns={column + '_y': column}, inplace=True)
    # Commit the cleansed data to the dataframe
    df = clean_df
    return df

fb_df = remove_n_a_records(fb_df, 'category')

In [None]:
fb_df

In [None]:
#seperate numerical columns and select numerical columns
df_numeric = fb_df.select_dtypes(include=[np.number])
numeric_cols = df_numeric.columns.values

In [None]:
#select non-numeric columns
df_non_numeric = fb_df.select_dtypes(exclude=[np.number])
non_numeric_cols = df_non_numeric.columns.values

In [4]:
fb_df['price'] = fb_df['price'].apply(
            lambda x: x.strip("£").replace(',',''))
fb_df['price'] = fb_df['price'].astype('float64')

In [None]:
#generate hierarchy from product_name, product_description and location
fb_df

In [None]:
fb_df['price'].head(10)

In [None]:
fb_df

In [None]:
#check for outliers
fb_df['price'].describe()

In [None]:
fb_df['price'].plot(kind='box', figsize=(12, 8))

In [5]:
def remove_price_outliers(df):
    
    df = df[df['price'] < 10000]
    df = df[df['price'] > 0.1]
    return df

fb_df = remove_price_outliers(fb_df)

In [None]:
fb_df['price'].plot(kind='box', figsize=(12, 8))

In [None]:
fb_df.describe()

In [6]:
#split categories into main using lambda
fb_df['main_category'] = fb_df['category'].apply(
    lambda x: x.split("/")[0].strip())
fb_df['sub_category'] = fb_df['category'].apply(
    lambda x: x.split("/")[1].strip())
#fb_df['mini_category'] = fb_df['category'].apply(
 #   lambda x: x.split("/")[2].strip())


In [None]:
fb_df

In [None]:
import seaborn as sns

In [None]:
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

geocoder = Nominatim(user_agent='GetLoc')
geocode = RateLimiter(geocoder.geocode, min_delay_seconds=0.3, return_value_on_exception=None)
#location = geocode.reverse(, timeout=10, language='en')
fb_df['geo_location'] = fb_df['location'].apply(geocode)
fb_df['longiude_latitude'] = fb_df['geo_location'].apply(
    lambda loc: tuple(loc.point) if loc else None)



In [None]:
fb_df

In [7]:
#clean random characters from columns
fb_df['product_name'] = fb_df['product_name'].str.lower().replace('\W+', '_', regex=True)
category_encodings = pd.get_dummies(
            fb_df['product_name'], prefix='product_name', drop_first=True)
fb_df['product_description'] = fb_df['product_description'].str.lower().replace('\W+', '_', regex=True)
category_encodings = pd.get_dummies(
            fb_df['product_description'], prefix='product_description', drop_first=True)
fb_df['category'] = fb_df['category'].str.lower().replace('\W+', '_', regex=True)
category_encodings = pd.get_dummies(
            fb_df['category'], prefix='category', drop_first=True)
fb_df['main_category'] = fb_df['main_category'].str.lower().replace('\W+', '_', regex=True)
category_encodings = pd.get_dummies(
            fb_df['main_category'], prefix='main_category', drop_first=True)
fb_df['location'] = fb_df['location'].str.lower().replace('\W+', '_', regex=True)
category_encodings = pd.get_dummies(
            fb_df['location'], prefix='location', drop_first=True)
fb_df['sub_category'] = fb_df['sub_category'].str.lower().replace('\W+', '_', regex=True)
category_encodings = pd.get_dummies(
            fb_df['sub_category'], prefix='sub_category', drop_first=True)

#fb_df = pd.concat(
            #[fb_df, category_encodings], axis=1)


In [8]:
fb_df

Unnamed: 0,id,product_name,category,product_description,price,location,page_id,create_time,main_category,sub_category
1,243809c0-9cfc-4486-ad12-3b7a16605ba9,mirror_wall_art_in_wokingham_berkshire_gumtree,home_garden_dining_living_room_furniture_mirro...,mirror_wall_art_posted_by_nisha_in_dining_livi...,5.0,wokingham_berkshire,1426704584,2022-02-26,home_garden,dining_living_room_furniture
2,1c58d3f9-8b93-47ea-9415-204fcc2a22e6,stainless_steel_food_steamer_in_inverness_high...,home_garden_other_household_goods,morphy_richard_s_model_no_48755_stainless_stee...,20.0,inverness_highland,1426704579,2022-02-26,home_garden,other_household_goods
3,860673f1-57f6-47ba-8d2f-13f9e05b8f9a,sun_loungers_in_skegness_lincolnshire_gumtree,home_garden_garden_patio_outdoor_settings_furn...,i_have_2_of_these_collection_only_as_i_don_t_d...,20.0,skegness_lincolnshire,1426704576,2022-02-26,home_garden,garden_patio
4,59948726-29be-4b35-ade5-bb2fd7331856,coffee_side_table_from_ammunition_ammo_box_hai...,home_garden_dining_living_room_furniture_other,great_reclaimed_army_ammunition_box_used_as_co...,115.0,radstock_somerset,1426704575,2022-02-26,home_garden,dining_living_room_furniture
5,16dbc860-696e-4cda-93f6-4dd4926573fb,modern_shannon_sofa_for_sale_at_low_cost_in_de...,home_garden_dining_living_room_furniture_sofas...,new_design_shannon_corner_sofa_5_seater_availa...,450.0,delph_manchester,1426704570,2022-02-26,home_garden,dining_living_room_furniture
...,...,...,...,...,...,...,...,...,...,...
8085,c4148656-78a9-4f3e-b393-134fdc5ef900,sony_playstation_vr_move_bundle_in_acocks_gree...,video_games_consoles_consoles_ps4_sony_playsta...,sony_playstation_vr_move_bundle353cash_on_coll...,260.0,acocks_green_west_midlands,1422159237,2022-02-28,video_games_consoles,consoles
8086,564e3411-768d-4250-a624-b119d696f103,playstation_vr_v2_bundle_in_acocks_green_west_...,video_games_consoles_consoles_ps4_sony_playsta...,playstation_vr_v2_bundle355cash_on_collection_...,235.0,acocks_green_west_midlands,1422159464,2022-02-28,video_games_consoles,consoles
8088,2b0a652b-46a2-4297-b619-5efeeb222787,oculus_quest_2_256gb_in_montrose_angus_gumtree,video_games_consoles_other_video_games_consoles,pick_up_only_250comes_with_two_pistols_stocks_...,250.0,montrose_angus,1426668818,2022-02-28,video_games_consoles,other_video_games_consoles
8089,719fd40a-870e-4144-b324-55dff2e66fb4,logitech_driving_force_shifter_in_carrickfergu...,video_games_consoles_video_game_accessories_ot...,bought_at_christmas_from_currys_retailing_at_4...,30.0,carrickfergus_county_antrim,1426699715,2022-02-28,video_games_consoles,video_game_accessories


In [None]:
fb_df.columns

In [None]:
sns.heatmap(fb_df.corr(), annot=True)

In [None]:
#remove duplicates
columns = ['product_name', 'product_description', 'location', 'category']
fb_df.drop_duplicates(subset=columns, keep="first", )

In [9]:
#tested with product_name, product_description, location, category, main_category
X = fb_df['main_category']
y = fb_df['price']

X

1                home_garden
2                home_garden
3                home_garden
4                home_garden
5                home_garden
                ...         
8085    video_games_consoles
8086    video_games_consoles
8088    video_games_consoles
8089    video_games_consoles
8090    video_games_consoles
Name: main_category, Length: 7011, dtype: object

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)
X_train

3039              diy_tools_materials
7922             video_games_consoles
6593                    health_beauty
3340              diy_tools_materials
7085       office_furniture_equipment
                    ...              
1055          music_films_books_games
5852    phones_mobile_phones_telecoms
4443          music_films_books_games
271                       home_garden
5796    phones_mobile_phones_telecoms
Name: main_category, Length: 4907, dtype: object

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
cvec = CountVectorizer(stop_words= 'english').fit(X_train)
df_train = pd.DataFrame(cvec.transform(X_train).todense(),
                            columns=cvec.get_feature_names_out())

In [12]:
df_test = pd.DataFrame(cvec.transform(X_test).todense(),
                            columns=cvec.get_feature_names_out())
print(df_train.shape)
print(y_train.shape)
print(df_test.shape)
print(y_test.shape)

(4907, 13)
(4907,)
(2104, 13)
(2104,)


In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

lm = LinearRegression()
lm.fit(df_train, y_train)
lm.score(df_test, y_test)
#y_pred = lm.predict(X_test)

0.028655975940928613

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

lm = LinearRegression()
lm.fit(df_train, y_train)
lm.score(df_test, y_test)
y_pred = lm.predict(df_test)

# The coefficients
print("Coefficients: \n", lm.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

Coefficients: 
 [1.45204921e+14 1.45204921e+14 1.45204921e+14 1.45204921e+14
 1.45204921e+14 1.45204921e+14 1.45204921e+14 1.45204921e+14
 1.45204921e+14 1.45204921e+14 1.45204921e+14 1.45204921e+14
 1.45204921e+14]
Mean squared error: 330223.95
Coefficient of determination: 0.03


In [None]:
fb_df

In [None]:
#export cleaned data to csv file
fb_df.to_csv('data/clened_data.csv', index=False)

In [None]:
#initialise 
#create dictionary of categories
#create dataframe of image paths
#numberies cats
#prepare image cat datapoint
