In [None]:
from sqlalchemy import create_engine
import pandas as pd
import yaml
import numpy as np
import sklearn
import os
from os.path import isfile
from pandas_profiling import ProfileReport

In [None]:
# Load the raw product listings into `fb_df`; the cells below clean this frame.
# NOTE(review): absolute, machine-specific path — this only runs where that
# directory exists.
fb_df = pd.read_json("/home/shah/Desktop/FB-Marketplace-Recommendation-Ranking-System/data/products_table.json")

In [None]:
# Drop the rows whose category is missing or is the literal placeholder 'N/A'.
# The mask is computed from the 'category' column but applied to the whole
# frame, so `fb_df` stays a DataFrame with all of its columns (assigning the
# column itself back to `fb_df` would turn it into a Series and break every
# later cell that indexes other columns).
fb_df = fb_df[fb_df['category'].notna() & fb_df['category'].ne('N/A')].copy()

In [None]:
def remove_n_a_rows(df, column: str):
    """Return a copy of ``df`` without the rows where ``column`` is unusable.

    A row is dropped when its value in ``column`` is missing (NaN/None) or is
    the literal placeholder string ``'N/A'``.

    Args:
        df (pandas.DataFrame): Frame to filter. It is not modified.
        column (str): Name of the column to inspect.

    Returns:
        pandas.DataFrame: The remaining rows as an independent copy, keeping
        the original columns, column order and index labels.

    Raises:
        KeyError: If ``column`` is not a column of ``df``.
    """
    values = df[column]
    # Filter with a boolean mask instead of an index-on-index merge: a merge
    # pairs every row that shares an index label, so with a non-unique index
    # it would bring 'N/A' rows back, and it relies on '<column>_x' /
    # '<column>_y' not already being column names.
    keep = values.notna() & values.ne('N/A')
    # .copy() so that later column assignments act on an independent frame
    # rather than on a slice of the caller's data.
    return df[keep].copy()

# Rebind `fb_df` to the frame returned for the 'category' column.
fb_df = remove_n_a_rows(fb_df, 'category')

In [None]:
# Preview the first rows of `fb_df` after the category clean-up.
fb_df.head()

In [None]:
# Build a pandas-profiling report for `fb_df` and write it to 'output.html'
# (a relative path, so it is resolved against the kernel's working directory).
prof = ProfileReport(fb_df)
prof.to_file(output_file='output.html')

In [None]:
# Convert 'price' from text such as "£1,250.00" to float64: strip the pound
# sign and remove thousands separators before casting.
# Going through .astype(str) and the vectorised .str methods keeps the cell
# re-runnable: once the column is already float, calling .strip on each value
# would raise AttributeError.
fb_df['price'] = (
    fb_df['price']
    .astype(str)
    .str.strip("£")
    .str.replace(',', '', regex=False)
    .astype('float64')
)
# reset_index returns a new frame; assign it, otherwise the renumbered index
# is only displayed and then thrown away.
fb_df = fb_df.reset_index(drop=True)
fb_df.head()

In [None]:
# Keep only the listings priced strictly between 1 and 10000 (both bounds
# excluded); rows with a missing price fail both comparisons and are dropped.
fb_df = fb_df[(fb_df['price'] > 1) & (fb_df['price'] < 10000)]

In [None]:
# Show the dtype of every column (run after the 'price' conversion cell).
fb_df.dtypes

In [None]:
# Derive 'main_category': the part of 'category' before the first "/", with
# surrounding whitespace removed.
fb_df['main_category'] = fb_df['category'].map(
    lambda category_path: category_path.partition("/")[0].strip())

In [None]:
# Normalise the free-text columns: lower-case each value, then collapse every
# run of characters outside [0-9a-zA-Z] into a single underscore.
# One-hot encoding these columns with pd.get_dummies (drop_first=True) and
# concatenating the result onto fb_df was tried here and is left disabled.
for text_column in ("product_name", "product_description", "location"):
    fb_df[text_column] = fb_df[text_column].str.lower().replace(
        '[^0-9a-zA-Z]+', '_', regex=True)


In [None]:
# Display `fb_df` to inspect the result of the text normalisation above.
fb_df

In [None]:
# Drop repeated listings: rows that share the same product name, description
# and location are duplicates, and only the first occurrence is kept.
columns=["product_name","product_description","location"]
# drop_duplicates returns a new frame; assign it back, otherwise the
# de-duplicated result is only displayed and `fb_df` keeps every duplicate.
fb_df = fb_df.drop_duplicates(subset=columns, keep="first")
fb_df.head()

In [None]:
# Persist the cleaned table as JSON.
# NOTE(review): absolute, machine-specific path.
save_path = "/home/shah/Desktop/FB-Marketplace-Recommendation-Ranking-System/data/cleaned_tabular_new.json"
clean_data = fb_df
# to_json returns None when it is given a path, so its result must not be
# assigned back: `clean_data` has to remain the DataFrame.
clean_data.to_json(save_path)

In [None]:
import seaborn as sns
# Box plot of 'price' for each 'main_category' value in fb_df.
sns.boxplot(x='main_category', y='price', data=fb_df)

In [None]:
# Feature matrix / target for the price regression.
# Built from `fb_df`, the cleaned frame that already exists at this point:
# `new_data` is only read back from disk in a later cell, so referencing it
# here raises NameError when the notebook is run top to bottom.
X = fb_df[['main_category']]
y = fb_df['price']

# One-hot encode the category; drop_first=True omits the first level's column.
X = pd.get_dummies(X, drop_first=True)
X.head()

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit a linear regression on the training rows and predict the held-out ones.
regr = LinearRegression().fit(X_train, y_train)
y_pred = regr.predict(X_test)

# Report the fitted coefficients, then the two evaluation metrics.
print("Coefficients: \n", regr.coef_)
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# Coefficient of determination (R^2): 1 is a perfect prediction.
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))
        

In [None]:
from sklearn.model_selection import train_test_split
# Re-split with a 70/30 ratio and a different seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)
# This split replaces X_test / y_test, so refit the existing regressor and
# regenerate y_pred. Otherwise y_pred still belongs to the previous split and
# has a different length from y_test, and any cell comparing the two (such as
# a y_test vs y_pred scatter plot) fails.
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
X_train.head()

In [None]:
import matplotlib.pyplot as plt
# Scatter the held-out prices (x axis) against the model's predictions (y axis).
# y_test and y_pred must have the same length, i.e. come from the same
# train/test split, otherwise plt.scatter raises.
plt.scatter(y_test, y_pred)

In [None]:
#result with product_name
#Coefficients: 
# [-114.6611036 -184.6611036 -184.4611036 ... -125.6611036 -160.6611036
# -165.6611036]
#Mean squared error: 312604.47
#Coefficient of determination: 0.02

#result with product_description
#Coefficients: 
# [-174.32112853 -176.32112853 -136.32112853 ... -168.32112853 -181.32112853
# -126.32112853]
#Mean squared error: 313237.48
#Coefficient of determination: 0.02

#result with location
#Coefficients: 
# [1.21784284e+14 1.21784284e+14 1.21784284e+14 ... 1.21784284e+14
# 1.21784284e+14 1.21784284e+14]
#Mean squared error: 1706705503531225766334300160.00
#Coefficient of determination: -5323258916010414047232.00

#result with main_category
#Coefficients: 
# [-1.32240253e+14 -1.32240253e+14 -1.32240253e+14 -1.32240253e+14
# -1.32240253e+14 -1.32240253e+14 -1.32240253e+14 -1.32240253e+14
# -1.32240253e+14 -1.32240253e+14 -1.32240253e+14 -1.32240253e+14
# -1.32240253e+14]
#Mean squared error: 307299.58
#Coefficient of determination: 0.04

In [None]:
# Display the current state of `fb_df` for inspection.
fb_df

In [None]:
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

geocoder = Nominatim(user_agent='GetLoc')
# The public Nominatim service allows at most one request per second, so space
# the calls by 1 s. Failed lookups yield None instead of raising.
geocode = RateLimiter(geocoder.geocode, min_delay_seconds=1, return_value_on_exception=None)

# Geocode each distinct location once and reuse the answer for every row that
# shares it, instead of issuing one network request per row.
geo_by_location = {
    place: geocode(place) for place in fb_df['location'].dropna().unique()
}
# dict.get returns None for anything that was not looked up (e.g. a missing
# location), which the next step already treats as "no result".
fb_df['geo_location'] = fb_df['location'].apply(geo_by_location.get)
# NOTE(review): the column name 'longiude_latitude' is misspelled; it is kept
# as-is because saved files / later code may rely on it. Also confirm the
# element order — geopy points are presumably (latitude, longitude, altitude).
fb_df['longiude_latitude'] = fb_df['geo_location'].apply(
    lambda loc: tuple(loc.point) if loc else None)


In [None]:
# Persist the frame again, now including the geocoding columns.
# NOTE(review): this path is relative, while the earlier save and the later
# read_json use an absolute path — they only refer to the same file if the
# kernel's working directory is the project root. Confirm.
save_path = "data/cleaned_tabular_new.json"
clean_data = fb_df
# to_json returns None when it is given a path, so its result must not be
# assigned back: `clean_data` has to remain the DataFrame.
clean_data.to_json(save_path)

In [None]:
# Load the image table from CSV (absolute, machine-specific path).
image_df = pd.read_csv('/home/shah/Desktop/FB-Marketplace-Recommendation-Ranking-System/data/Images.csv')

In [None]:
# Read the cleaned product table back from the JSON file written earlier
# (absolute, machine-specific path).
new_data = pd.read_json('/home/shah/Desktop/FB-Marketplace-Recommendation-Ranking-System/data/cleaned_tabular_new.json')

In [None]:
# Inner join: keep only the image rows whose 'product_id' matches a product
# 'id' in the cleaned table, and vice versa.
merge_df = image_df.merge(new_data, how='inner', left_on='product_id', right_on='id')

In [None]:
# Display the merged image/product frame for inspection.
merge_df

In [None]:
# Write the merged image/product table to CSV.
# NOTE(review): absolute, machine-specific path.
save_dir = "/home/shah/Desktop/FB-Marketplace-Recommendation-Ranking-System/data/new_merge_df.csv"
merge_data = merge_df
# Export the merged frame itself (not `clean_data`), without `axis`, which
# to_csv does not accept. to_csv returns None when given a path, so its result
# is not assigned. index=False matches export_df_to_csv, which writes the
# same file.
merge_data.to_csv(save_dir, index=False)

In [None]:
def export_df_to_csv(
    df,
    path='/home/shah/Desktop/FB-Marketplace-Recommendation-Ranking-System/data/new_merge_df.csv',
):
    """
    Export the dataframe to a csv file, without the index column.

    Args:
        df (pandas.DataFrame): The dataframe to export.
        path (str or os.PathLike): Destination file. Defaults to the
            project's ``new_merge_df.csv`` so existing one-argument calls
            keep writing to the same place; an existing file is overwritten.
    """
    df.to_csv(path, index=False)

# Write the merged image/product frame to the function's CSV destination.
export_df_to_csv(merge_df)  