# Precleaning/Processing

In [25]:
import pandas as pd
import numpy as np


### Precleaning/Processing Orders.csv

In [26]:
df_orders = pd.read_csv("orders.csv", delimiter = ';' )
df_orders

Unnamed: 0,id,created_at,user_id,product_id,discount,quantity,subtotal,tax,total
0,1,2019-02-11T21:40:27.892Z,1,14,0.00000,2,37.6481,2.07,39.7181
1,2,2018-05-15T08:04:04.580Z,1,123,0.00000,3,110.9310,6.10,117.0380
2,3,2019-12-06T22:22:48.544Z,1,105,6.41668,2,52.7235,2.90,55.6221
3,4,2019-08-22T16:30:42.392Z,1,94,0.00000,6,109.2190,6.01,115.2210
4,5,2018-10-10T03:34:47.309Z,1,132,0.00000,5,127.8820,7.03,134.9420
...,...,...,...,...,...,...,...,...,...
18755,18756,2018-09-07T16:05:20.186Z,2499,165,0.00000,4,38.3045,2.49,79.5597
18756,18757,2018-12-13T08:54:11.721Z,2499,131,0.00000,2,113.1170,7.35,75.1032
18757,18758,2018-08-10T11:18:20.444Z,2499,127,0.00000,7,134.4800,8.74,82.0753
18758,18759,2019-12-25T09:40:29.152Z,2500,45,0.00000,2,118.0500,7.08,103.5760


In [27]:
#Checking missing value
df_orders.isnull().sum()

id            0
created_at    0
user_id       0
product_id    0
discount      0
quantity      0
subtotal      0
tax           0
total         0
dtype: int64

In [28]:
#Dropping subtotal because of inconsistency with price, unneccessary columns
df_ordersnew = df_orders.drop(["subtotal","tax","total","discount"],axis= 1)
df_ordersnew


df_ordersnew["date_only"] = pd.to_datetime(df_ordersnew["created_at"].str[:10])
df_ordersnew

Unnamed: 0,id,created_at,user_id,product_id,quantity,date_only
0,1,2019-02-11T21:40:27.892Z,1,14,2,2019-02-11
1,2,2018-05-15T08:04:04.580Z,1,123,3,2018-05-15
2,3,2019-12-06T22:22:48.544Z,1,105,2,2019-12-06
3,4,2019-08-22T16:30:42.392Z,1,94,6,2019-08-22
4,5,2018-10-10T03:34:47.309Z,1,132,5,2018-10-10
...,...,...,...,...,...,...
18755,18756,2018-09-07T16:05:20.186Z,2499,165,4,2018-09-07
18756,18757,2018-12-13T08:54:11.721Z,2499,131,2,2018-12-13
18757,18758,2018-08-10T11:18:20.444Z,2499,127,7,2018-08-10
18758,18759,2019-12-25T09:40:29.152Z,2500,45,2,2019-12-25


In [29]:
df_ordersnew.isnull().sum()

id            0
created_at    0
user_id       0
product_id    0
quantity      0
date_only     0
dtype: int64

### Precleaning/Processing products.csv

In [30]:
df_products = pd.read_csv("products.csv", delimiter = ';' )

In [31]:
df_productsnew = df_products.drop(["ean","quantity","created_at"], axis= 1)
df_productsnew


Unnamed: 0,id,category,price,rating,title,vendor
0,1,Gizmo,29.4633,4.6,Rustic Paper Wallet,"Swaniawski, Casper and Hilll"
1,2,Doohickey,70.0799,0.0,Small Marble Shoes,Balistreri-Ankunding
2,3,Doohickey,35.3887,4.0,Synergistic Granite Chair,"Murray, Watsica and Wunsch"
3,4,Doohickey,73.9918,3.0,Enormous Aluminum Shirt,Regan Bradtke and Sons
4,5,Gadget,82.7451,4.0,Enormous Marble Wallet,"Price, Schultz and Daniel"
...,...,...,...,...,...,...
195,196,Widget,46.7641,0.0,Heavy-Duty Linen Toucan,Balistreri-Muller
196,197,Gizmo,46.7641,4.6,Aerodynamic Concrete Lamp,Erika Volkman Group
197,198,Gizmo,46.7641,4.1,Enormous Copper Shirt,"Considine, Schamberger and Schiller"
198,199,Widget,76.9533,3.6,Mediocre Leather Coat,"Gulgowski, Grimes and Mayer"


### Precleaning/Processing reviews.csv

In [32]:
df_reviews = pd.read_csv("reviews.csv", delimiter = ';' )

In [33]:
df_reviewsnew = df_reviews.drop(["created_at"],axis= 1)
df_reviewsnew

Unnamed: 0,id,reviewer,product_id,rating,body
0,1,christ,1,5,Ad perspiciatis quis et consectetur. Laboriosa...
1,2,xavier,1,4,Reprehenderit non error architecto consequatur...
2,3,cameron.nitzsche,1,5,In aut numquam labore fuga. Et tempora sit et ...
3,4,barbara-shields,1,4,Est accusamus provident non animi labore minus...
4,5,clement,1,5,Id sed sint corrupti molestias ad alias aut in...
...,...,...,...,...,...
1107,1108,dandre,200,4,Quia natus eaque odit aperiam quasi ea. Fugit ...
1108,1109,will,200,3,Alias exercitationem quo aut rem accusamus quo...
1109,1110,cecilia.hyatt,200,4,Et doloribus quo laudantium earum accusamus pa...
1110,1111,delbert,200,5,Eligendi id adipisci quis quaerat est dolorem ...


### Precleaning/Processing users.csv

In [34]:
df_users = pd.read_csv("users.csv", delimiter = ';' ,encoding='latin-1')
df_users

Unnamed: 0,id,created_at,name,email,address,city,state,zip,birth_date,latitude,longitude,password,source
0,1,2017-10-07T01:34:35.462Z,Hudson Borer,borer-hudson@yahoo.com,9611-9809 West Rosedale Road,Wood River,NE,68883,1986-12-12,40.7132,-98.5260,ccca881f-3e4b-4e5c-8336-354103604af6,Twitter
1,2,2018-04-09T12:10:05.167Z,Domenica Williamson,williamson-domenica@yahoo.com,101 4th Street,Searsboro,IA,50242,1967-06-10,41.5813,-92.6991,eafc45bf-cf8e-4c96-ab35-ce44d0021597,Affiliate
2,3,2017-06-27T06:06:20.625Z,Lina Heaney,lina.heaney@yahoo.com,29494 Anderson Drive,Sandstone,MN,55072,1961-12-18,46.1197,-92.8416,36f67891-34e5-4439-a8a4-2d9246775ff8,Facebook
3,4,2019-02-21T13:59:15.348Z,Arnold Adams,adams.arnold@gmail.com,2-7900 Cuerno Verde Road,Rye,CO,81069,1992-08-12,37.9203,-104.9730,537a727b-7525-44a3-99c8-8fdc488fbf02,Google
4,5,2017-09-05T03:36:44.811Z,Dominique Leffler,leffler.dominique@hotmail.com,761 Fish Hill Road,Beaver Dams,NY,14812,1974-04-20,42.3490,-77.0567,6a802b6c-4da8-4881-9ca6-4f69085c7c14,Twitter
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,2496,2018-11-26T04:53:41.010Z,Jettie Yost,jettie-yost@gmail.com,22000-22998 County Road G,Yuma,CO,80759,1988-05-15,39.9009,-102.6890,4f3de414-f0a7-4cf1-80b6-dad18f5cf1be,Affiliate
2496,2497,2017-01-05T02:58:59.217Z,Sadye Gibson,sadye.gibson@gmail.com,23402 Camp Arrowhead Road,Lewes,DE,19958,1986-06-12,38.6579,-75.1427,b147382c-29d2-4801-a606-3d1d8f840082,Organic
2497,2498,2016-08-31T11:49:59.777Z,Verner Hamill,verner.hamill@gmail.com,186 4500 South,Milford,UT,84751,1962-07-29,38.3279,-113.0120,45afa7ae-0df9-42dc-ae22-6ea667085ef4,Affiliate
2498,2499,2017-03-24T09:27:51.686Z,Cloyd Beer,cloyd-beer@gmail.com,15708 East Lincoln Road,Spokane,WA,99217,1964-08-13,47.7267,-117.1940,0a25bd21-90ac-461d-903f-8ecf6dfc4af1,Twitter


In [35]:
#Dropping unneccasary columns
df_usersnew = df_users.drop(["password","address"],axis = 1)
df_usersnew.head()

Unnamed: 0,id,created_at,name,email,city,state,zip,birth_date,latitude,longitude,source
0,1,2017-10-07T01:34:35.462Z,Hudson Borer,borer-hudson@yahoo.com,Wood River,NE,68883,1986-12-12,40.7132,-98.526,Twitter
1,2,2018-04-09T12:10:05.167Z,Domenica Williamson,williamson-domenica@yahoo.com,Searsboro,IA,50242,1967-06-10,41.5813,-92.6991,Affiliate
2,3,2017-06-27T06:06:20.625Z,Lina Heaney,lina.heaney@yahoo.com,Sandstone,MN,55072,1961-12-18,46.1197,-92.8416,Facebook
3,4,2019-02-21T13:59:15.348Z,Arnold Adams,adams.arnold@gmail.com,Rye,CO,81069,1992-08-12,37.9203,-104.973,Google
4,5,2017-09-05T03:36:44.811Z,Dominique Leffler,leffler.dominique@hotmail.com,Beaver Dams,NY,14812,1974-04-20,42.349,-77.0567,Twitter


In [36]:
from datetime import datetime

def extract_date_and_duration(df, column_name="created_at"):
    """
    Extracts the date (first 10 characters) from a given datetime column 
    and calculates the duration (in days) from that date until today.

    Parameters:
    df (pd.DataFrame): The input DataFrame.
    column_name (str): The name of the column containing datetime strings.

    Returns:
    pd.DataFrame: The updated DataFrame with 'date_only' and 'days_since' columns.
    """
    df = df.copy()  # Avoid modifying the original DataFrame
    df["date_only"] = df[column_name].str[:10]  # Extract date part
    df["date_only"] = pd.to_datetime(df["date_only"])  # Convert to datetime format

    today = pd.to_datetime(datetime.today().date())  # Get today's date
    df["dayduration"] = (today - df["date_only"]).dt.days  # Calculate duration
    

    return df

In [37]:
dfuserstemp = extract_date_and_duration(df_usersnew)
df_usersclean = dfuserstemp.drop(["created_at","date_only"], axis= 1)
df_usersclean

Unnamed: 0,id,name,email,city,state,zip,birth_date,latitude,longitude,source,dayduration
0,1,Hudson Borer,borer-hudson@yahoo.com,Wood River,NE,68883,1986-12-12,40.7132,-98.5260,Twitter,2681
1,2,Domenica Williamson,williamson-domenica@yahoo.com,Searsboro,IA,50242,1967-06-10,41.5813,-92.6991,Affiliate,2497
2,3,Lina Heaney,lina.heaney@yahoo.com,Sandstone,MN,55072,1961-12-18,46.1197,-92.8416,Facebook,2783
3,4,Arnold Adams,adams.arnold@gmail.com,Rye,CO,81069,1992-08-12,37.9203,-104.9730,Google,2179
4,5,Dominique Leffler,leffler.dominique@hotmail.com,Beaver Dams,NY,14812,1974-04-20,42.3490,-77.0567,Twitter,2713
...,...,...,...,...,...,...,...,...,...,...,...
2495,2496,Jettie Yost,jettie-yost@gmail.com,Yuma,CO,80759,1988-05-15,39.9009,-102.6890,Affiliate,2266
2496,2497,Sadye Gibson,sadye.gibson@gmail.com,Lewes,DE,19958,1986-06-12,38.6579,-75.1427,Organic,2956
2497,2498,Verner Hamill,verner.hamill@gmail.com,Milford,UT,84751,1962-07-29,38.3279,-113.0120,Affiliate,3083
2498,2499,Cloyd Beer,cloyd-beer@gmail.com,Spokane,WA,99217,1964-08-13,47.7267,-117.1940,Twitter,2878


# Initial Feature Engineering

### Orders feature engineering

In [38]:
orderstemp = pd.merge(df_ordersnew, df_productsnew, how="inner", left_on="product_id", right_on="id")
orderstemp = orderstemp.drop(["id_y","rating","title", "vendor"], axis = 1)
orderstemp["total"] = orderstemp["price"] * orderstemp["quantity"]
orderstemp.rename(columns={"id_x": "id"})
orderstemp

Unnamed: 0,id_x,created_at,user_id,product_id,quantity,date_only,category,price,total
0,1,2019-02-11T21:40:27.892Z,1,14,2,2019-02-11,Widget,25.0988,50.1976
1,2,2018-05-15T08:04:04.580Z,1,123,3,2018-05-15,Gizmo,73.9543,221.8629
2,3,2019-12-06T22:22:48.544Z,1,105,2,2019-12-06,Gadget,35.1490,70.2980
3,4,2019-08-22T16:30:42.392Z,1,94,6,2019-08-22,Widget,72.8124,436.8744
4,5,2018-10-10T03:34:47.309Z,1,132,5,2018-10-10,Widget,85.2546,426.2730
...,...,...,...,...,...,...,...,...,...
18755,18756,2018-09-07T16:05:20.186Z,2499,165,4,2018-09-07,Gizmo,25.5363,102.1452
18756,18757,2018-12-13T08:54:11.721Z,2499,131,2,2018-12-13,Widget,75.4115,150.8230
18757,18758,2018-08-10T11:18:20.444Z,2499,127,7,2018-08-10,Widget,89.6534,627.5738
18758,18759,2019-12-25T09:40:29.152Z,2500,45,2,2019-12-25,Doohickey,78.6997,157.3994


### Reviews Feature Engineering

In [39]:
df_reviewsnew["review_length"] = df_reviewsnew["body"].str.len()
df_reviewsnew

Unnamed: 0,id,reviewer,product_id,rating,body,review_length
0,1,christ,1,5,Ad perspiciatis quis et consectetur. Laboriosa...,171
1,2,xavier,1,4,Reprehenderit non error architecto consequatur...,237
2,3,cameron.nitzsche,1,5,In aut numquam labore fuga. Et tempora sit et ...,202
3,4,barbara-shields,1,4,Est accusamus provident non animi labore minus...,158
4,5,clement,1,5,Id sed sint corrupti molestias ad alias aut in...,138
...,...,...,...,...,...,...
1107,1108,dandre,200,4,Quia natus eaque odit aperiam quasi ea. Fugit ...,212
1108,1109,will,200,3,Alias exercitationem quo aut rem accusamus quo...,172
1109,1110,cecilia.hyatt,200,4,Et doloribus quo laudantium earum accusamus pa...,286
1110,1111,delbert,200,5,Eligendi id adipisci quis quaerat est dolorem ...,235


### Users Feature Engineering

In [40]:
df_usersclean["age"] = 2024 - df_usersclean["birth_date"].str[:4].astype(int)
df_usersclean.head()




Unnamed: 0,id,name,email,city,state,zip,birth_date,latitude,longitude,source,dayduration,age
0,1,Hudson Borer,borer-hudson@yahoo.com,Wood River,NE,68883,1986-12-12,40.7132,-98.526,Twitter,2681,38
1,2,Domenica Williamson,williamson-domenica@yahoo.com,Searsboro,IA,50242,1967-06-10,41.5813,-92.6991,Affiliate,2497,57
2,3,Lina Heaney,lina.heaney@yahoo.com,Sandstone,MN,55072,1961-12-18,46.1197,-92.8416,Facebook,2783,63
3,4,Arnold Adams,adams.arnold@gmail.com,Rye,CO,81069,1992-08-12,37.9203,-104.973,Google,2179,32
4,5,Dominique Leffler,leffler.dominique@hotmail.com,Beaver Dams,NY,14812,1974-04-20,42.349,-77.0567,Twitter,2713,50


##

# Loading Data

In [41]:
orders_clean = orderstemp.copy()
products_clean = df_productsnew.copy()
reviews_clean = df_reviewsnew.copy()
users_clean = df_usersclean.copy()

# Save DataFrames to CSV files
orders_clean.to_csv("orders_clean.csv", index=False)
products_clean.to_csv("products_clean.csv", index=False)
reviews_clean.to_csv("reviews_clean.csv", index=False)
users_clean.to_csv("users_clean.csv", index=False)