In [143]:
import opendatasets as od # Importing the opendatasets library to download the dataset from Kaggle
import pandas as pd # Importing the pandas library to work with the tabular dataset
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder # importing the necessary classes to process the data
from sklearn.compose import ColumnTransformer # importing the 'ColumnTransformer' class to build a complex transformer
from sklearn.pipeline import Pipeline # import the 'Pipeline' class to build pipeline
from sklearn.linear_model import LogisticRegression # import the 'LogisticRegression' for prediction
import joblib # import the 'joblib' module to save or download the model

In [144]:
# Download the data from Kaggle
od.download(
    "https://www.kaggle.com/datasets/poojakeer/e-commerce-dataset",
     force=True
     )

Dataset URL: https://www.kaggle.com/datasets/poojakeer/e-commerce-dataset


ApiException: (500)
Reason: Internal Server Error
HTTP response headers: HTTPHeaderDict({'Content-Type': 'text/html; charset=utf-8', 'Date': 'Fri, 17 May 2024 14:27:23 GMT', 'Cache-Control': 'no-cache,no-store', 'Expires': '-1', 'Pragma': 'no-cache', 'Set-Cookie': 'ka_sessionid=3ce1855b825d5f24e8187d4466345d06; max-age=2626560; path=/, CSRF-TOKEN=CfDJ8B5GsGLMFaFLm6_4BsA80RQ3m_crdVZCIN5qo_I_URO6aglFdQioXw08g2qSVEMyro0c6UlewfwanWdEgPIh_KLTVGKy9sT-UmaoiYFhWA; path=/; secure; samesite=lax; httponly, XSRF-TOKEN=CfDJ8B5GsGLMFaFLm6_4BsA80RSdPxajm7eqOifOGMhz769PZ4fmOr0m2t6Gj6f10CaCmBlmyXagwV6MO6pxoaR8WLF4zltwUxomBC0I9Ic5aAB3JsSzO4dsTdZg19I5RLUdfYr0UysiBXmN7WiqS4Oz-qY; path=/; secure; samesite=lax, CLIENT-TOKEN=eyJhbGciOiJub25lIiwidHlwIjoiSldUIn0.eyJpc3MiOiJrYWdnbGUiLCJhdWQiOiJjbGllbnQiLCJzdWIiOiJyb21hbm1lc2hrb3YiLCJuYnQiOiIyMDI0LTA1LTE3VDE0OjI3OjIzLjQxNTY1NzNaIiwiaWF0IjoiMjAyNC0wNS0xN1QxNDoyNzoyMy40MTU2NTczWiIsImp0aSI6ImZjYjc2NWJkLTE4ODAtNGQ2Yi04ZDhjLTBjMDRlMDQ3MmNhYyIsImV4cCI6IjIwMjQtMDYtMTdUMTQ6Mjc6MjMuNDE1NjU3M1oiLCJ1aWQiOjYyMTY3NTMsImRpc3BsYXlOYW1lIjoiSXptYWxraW4gQWxla3NlaSIsImVtYWlsIjoic25kYm94NDRAZ21haWwuY29tIiwidGllciI6Ik5vdmljZSIsInZlcmlmaWVkIjpmYWxzZSwicHJvZmlsZVVybCI6Ii9yb21hbm1lc2hrb3YiLCJ0aHVtYm5haWxVcmwiOiJodHRwczovL3N0b3JhZ2UuZ29vZ2xlYXBpcy5jb20va2FnZ2xlLWF2YXRhcnMvdGh1bWJuYWlscy9kZWZhdWx0LXRodW1iLnBuZyIsImZmIjpbIktlcm5lbHNHaXRodWJTeW5jIiwiSW1wb3J0S2VybmVsc0Zyb21Db2xhYiIsIktlcm5lbHNEcmFmdFVwbG9hZEJsb2IiLCJLZXJuZWxzRXhwb3J0TWV0YWRhdGEiLCJLZXJuZWxzRmlyZWJhc2VMb25nUG9sbGluZyIsIkNvbW11bml0eUxvd2VySGVhZGVyU2l6ZXMiLCJBbGxvd0ZvcnVtQXR0YWNobWVudHMiLCJLZXJuZWxzU2F2ZUNlbGxPdXRwdXQiLCJGcm9udGVuZEVycm9yUmVwb3J0aW5nIiwiRGF0YXNldHNNYW5hZ2VkRm9jdXNPbk9wZW4iLCJDaGFuZ2VEYXRhc2V0T3duZXJzaGlwVG9PcmciLCJFeHBvcnREYXRhc2V0QXNDcm9pc3NhbnQiLCJLTVVzZXJQcm9maWxlIiwiTW9kZWxzQ2FjaGVkVGFnU2VydmljZUVuYWJsZWQiLCJEaXNjdXNzaW9uc1JlYWN0aW9ucyIsIkRhdGFzZXRVcGxvYWRlckR1cGxpY2F0ZURldGVjdGlvbiIsIkRhdGFzZXRzTWV0YWRhdGFTdWdnZXN0aW9ucyIsIk1vZGVsc0Jvb2ttYXJraW5nIiwiQ29tcGV0aXRpb25EaXNjdXNzaW9uc1RlYW1VcFBvcG92ZXIiLCJNb2RlbHNMYW5kaW5nUGFnZUNvbW11bml0eVNlY3Rpb25FbmFibGVkIiwiQ29tcGV0aXRpb25MZWFkZXJib2FyZFRlYW1VcENvbnRhY3RQYW5lbCIsIlBpbm5lZFdvcmsiLCJNZXRhc3RvcmVDaGVja0FnZ3JlZ2F0ZUZpbGVIYXNoZXMiXSwiZmZkIjp7Iktlcm5lbEVkaXRvckF1dG9zYXZlVGhyb3R0bGVNcyI6IjMwMDAwIiwiRW1lcmdlbmN5QWxlcnRCYW5uZXIiOiJ7fSIsIkNsaWVudFJwY1JhdGVMaW1pdFFwcyI6IjQwIiwiQ2xpZW50UnBjUmF0ZUxpbWl0UXBtIjoiNTAwIiwiRmVhdHVyZWRDb21tdW5pdHlDb21wZXRpdGlvbnMiOiI2MDA5NSw1NDAwMCw1NzE2MyIsIkFkZEZlYXR1cmVGbGFnc1RvUGFnZUxvYWRUYWciOiJkaXNhYmxlZCIsIk1vZGVsSWRzQWxsb3dJbmZlcmVuY2UiOiIzMzAxLDM1MzMiLCJNb2RlbEluZmVyZW5jZVBhcmFtZXRlcnMiOiJ7IFwibWF4X3Rva2Vuc1wiOiAxMjgsIFwidGVtcGVyYXR1cmVcIjogMC40LCBcInRvcF9rXCI6IDUgfSIsIlNpbUNvbXBldGl0aW9uSWRzVG9JZ25vcmVVcGxvYWRMaW1pdCI6IjYwMjQzLDYxMjUwLDYxMjQ3IiwiQ29tcGV0aXRpb25NZXRyaWNUaW1lb3V0TWludXRlcyI6IjMwIiwiVGZIdWJLYWdnbGVBbm5vdW5jZW1lbnRVcmwiOiIvZGlzY3Vzc2lvbnMvcHJvZHVjdC1mZWVkYmFjay80NDg0MjUifSwicGlkIjoia2FnZ2xlLTE2MTYwNyIsInN2YyI6IndlYi1mZSIsInNkYWsiOiJBSXphU3lBNGVOcVVkUlJza0pzQ1pXVnotcUw2NTVYYTVKRU1yZUUiLCJibGQiOiI4MTRjMjgwMWI1YTQ4Njg0YWM0YjAwOTFkNThkNjdiNGNmZGMzMTBiIn0.; path=/; secure; samesite=lax, GCLB=CNHB9JLJnZafzAEQAw; path=/; HttpOnly', 'Transfer-Encoding': 'chunked', 'Vary': 'Accept-Encoding', 'Turbolinks-Location': 'https://www.kaggle.com/error/oops?datasetVersionNumber=None', 'X-Frame-Options': 'SAMEORIGIN', 'Strict-Transport-Security': 'max-age=63072000; includeSubDomains; preload', 'Content-Security-Policy': "object-src 'none'; script-src 'nonce-/b8d9KLvMX4uxSXXgs80Nw==' 'report-sample' 'unsafe-inline' 'unsafe-eval' 'strict-dynamic' https: http:; base-uri 'none'; report-uri https://csp.withgoogle.com/csp/kaggle/20201130; frame-src 'self' https://www.kaggleusercontent.com https://www.youtube.com/embed/ https://polygraph-cool.github.io https://www.google.com/recaptcha/ https://www.docdroid.com https://www.docdroid.net https://kaggle-static.storage.googleapis.com https://kkb-production.jupyter-proxy.kaggle.net https://kkb-production.firebaseapp.com https://kaggle-metastore.firebaseapp.com https://apis.google.com https://content-sheets.googleapis.com/ https://accounts.google.com/ https://storage.googleapis.com https://docs.google.com https://drive.google.com https://calendar.google.com/;", 'X-Content-Type-Options': 'nosniff', 'Referrer-Policy': 'strict-origin-when-cross-origin', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: b'\r\n\r\n<!DOCTYPE html>\r\n<html lang="en">\r\n\r\n<head>\r\n  <title>Kaggle: Your Home for Data Science</title>\r\n  <meta charset="utf-8" />\r\n    <meta name="robots" content="index, follow" />\r\n  <meta name="description" content="Kaggle is the world&#x2019;s largest data science community with powerful tools and resources to help you achieve your data science goals." />\r\n  <meta name="turbolinks-cache-control" content="no-cache" />\r\n  <meta name="theme-color" content="#008ABC" />\r\n  <script nonce="/b8d9KLvMX4uxSXXgs80Nw==" type="text/javascript">\r\n    window["pageRequestStartTime"] = 1715956043413;\r\n    window["pageRequestEndTime"] = 1715956043416;\r\n    window["initialPageLoadStartTime"] = new Date().getTime();\r\n  </script>\r\n  <script nonce="/b8d9KLvMX4uxSXXgs80Nw==" id="gsi-client" src="https://accounts.google.com/gsi/client" async defer></script>\r\n  <script nonce="/b8d9KLvMX4uxSXXgs80Nw==">window.KAGGLE_JUPYTERLAB_PATH = "/static/assets/jupyterlab/jupyterlab-index-5f175b831f00674707e1.html";</script>\r\n  <link rel="preconnect" href="https://www.google-analytics.com" crossorigin="anonymous" /><link rel="preconnect" href="https://stats.g.doubleclick.net" /><link rel="preconnect" href="https://storage.googleapis.com" /><link rel="preconnect" href="https://apis.google.com" />\r\n  <link href="/static/images/favicon.ico" rel="shortcut icon" type="image/x-icon" />\r\n  <link rel="manifest" href="/static/json/manifest.json" crossorigin="use-credentials">\r\n\r\n\r\n  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />\r\n\r\n  <link href="https://fonts.googleapis.com/css?family=Inter:400,400i,500,500i,600,600i,700,700i&display=swap"\r\n    rel="preload" as="style" />\r\n  <link href="https://fonts.googleapis.com/css2?family=Google+Symbols:FILL@0..1&display=block"\r\n    rel="preload" as="style" />\r\n  <link href="https://fonts.googleapis.com/css?family=Inter:400,400i,500,500i,600,600i,700,700i&display=swap"\r\n    rel="stylesheet" media="print" id="async-google-font-1" />\r\n  <link href="https://fonts.googleapis.com/css2?family=Google+Symbols:FILL@0..1&display=block"\r\n    rel="stylesheet" media="print" id="async-google-font-2" />\r\n  <script nonce="/b8d9KLvMX4uxSXXgs80Nw==" type="text/javascript">\r\n    const styleSheetIds = ["async-google-font-1", "async-google-font-2"];\r\n    styleSheetIds.forEach(function (id) {\r\n      document.getElementById(id).addEventListener("load", function() {\r\n        this.media = "all";\r\n      });\r\n    });\r\n  </script>\r\n\r\n  <script nonce="/b8d9KLvMX4uxSXXgs80Nw==" src="https://www.google.com/recaptcha/enterprise.js?render=6LcW02cpAAAAAJlaJemsQQEwAiTEYB4aR6FYE_rD&waf=session" async defer></script>\r\n   <style>.grecaptcha-badge { visibility: hidden; }</style>\r\n\r\n    <link rel="stylesheet" type="text/css" href="/static/assets/vendor.css?v=dne" />\r\n    <link rel="stylesheet" type="text/css" href="/static/assets/app.css?v=62d595175a876550f3e6" />\r\n\r\n  \r\n    \r\n \r\n      <script nonce="/b8d9KLvMX4uxSXXgs80Nw==">\r\n        try{(function(a,s,y,n,c,h,i,d,e){d=s.createElement("style");\r\n        d.appendChild(s.createTextNode(""));s.head.appendChild(d);d=d.sheet;\r\n        y=y.map(x => d.insertRule(x + "{ opacity: 0 !important }"));\r\n        h.start=1*new Date;h.end=i=function(){y.forEach(x => x<d.cssRules.length ? d.deleteRule(x) : {})};\r\n        (a[n]=a[n]||[]).hide=h;setTimeout(function(){i();h.end=null},c);h.timeout=c;\r\n        })(window,document,[\'.site-header-react__nav\'],\'dataLayer\',2000,{\'GTM-52LNT9S\':true});}catch(ex){}\r\n    </script>\r\n    <script nonce="/b8d9KLvMX4uxSXXgs80Nw==">\r\n        window.dataLayer = window.dataLayer || [];\r\n        function gtag() { dataLayer.push(arguments); }\r\n        gtag(\'js\', new Date());\r\n        gtag(\'config\', \'G-T7QHS60L4Q\', {\r\n            \'optimize_id\': \'GTM-52LNT9S\',\r\n            \'displayFeaturesTask\': null,\r\n            \'send_page_view\': false,\r\n            \'content_group1\': \'Error\'\r\n        });\r\n    </script>\r\n    <script nonce="/b8d9KLvMX4uxSXXgs80Nw==" async src="https://www.googletagmanager.com/gtag/js?id=G-T7QHS60L4Q"></script>\r\n\r\n  \r\n    \r\n\r\n  <meta name="twitter:site" content="@Kaggle" /> \r\n  \r\n    \r\n\r\n  \r\n    \r\n\r\n  \r\n    \r\n\r\n\r\n    <script nonce="/b8d9KLvMX4uxSXXgs80Nw==">window[\'useKaggleAnalytics\'] = true;</script>\r\n\r\n  <script id="gapi-target" nonce="/b8d9KLvMX4uxSXXgs80Nw==" src="https://apis.google.com/js/api.js" defer\r\n    async></script>\r\n  <script nonce="/b8d9KLvMX4uxSXXgs80Nw==" src="/static/assets/runtime.js?v=9de5c755cff2a1d66693" data-turbolinks-track="reload"></script>\r\n  <script nonce="/b8d9KLvMX4uxSXXgs80Nw==" src="/static/assets/vendor.js?v=884487708fb1b3b5ce46" data-turbolinks-track="reload"></script>\r\n  <script nonce="/b8d9KLvMX4uxSXXgs80Nw==" src="/static/assets/app.js?v=03eb5374b7f6e9249469" data-turbolinks-track="reload"></script>\r\n    <script nonce="/b8d9KLvMX4uxSXXgs80Nw==" type="text/javascript">\r\n      window.kaggleStackdriverConfig = {\r\n        key: \'AIzaSyA4eNqUdRRskJsCZWVz-qL655Xa5JEMreE\',\r\n        projectId: \'kaggle-161607\',\r\n        service: \'web-fe\',\r\n        version: \'ci\',\r\n        userId: \'6216753\'\r\n      }\r\n    </script>\r\n</head>\r\n\r\n<body data-turbolinks="false">\r\n  <main>\r\n    \r\n\r\n\r\n\r\n\r\n\r\n<div id="site-container"></div>\r\n\r\n<div id="site-body" class="hide">\r\n    \r\n</div>\r\n\r\n\r\n\r\n    <!-- Cheers, web-67d9cc4cd-hllgdp. -->\r\n\r\n  </main>\r\n</body>\r\n\r\n</html>\r\n'


In [None]:
df = pd.read_csv('./e-commerce-dataset/Train.csv') # Load the dataset into a pandas dataframe
df.head(3) # Display the first few rows of the dataframe 

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,D,Flight,4,2,177,3,low,F,44,1233,1
1,2,F,Flight,4,5,216,2,low,M,59,3088,1
2,3,A,Flight,2,2,183,4,low,M,48,3374,1


In [None]:
df.drop('ID', axis=1, inplace=True) if "ID" in df.columns else None # Drop the 'ID' column from the dataset if exists
df.to_csv('./e-commerce-dataset/Train.csv', index=False) # Save the modified dataset back to the file
df = pd.read_csv('./e-commerce-dataset/Train.csv') # Load the dataset into a pandas dataframe
df.info() # Display information about the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Warehouse_block      10999 non-null  object
 1   Mode_of_Shipment     10999 non-null  object
 2   Customer_care_calls  10999 non-null  int64 
 3   Customer_rating      10999 non-null  int64 
 4   Cost_of_the_Product  10999 non-null  int64 
 5   Prior_purchases      10999 non-null  int64 
 6   Product_importance   10999 non-null  object
 7   Gender               10999 non-null  object
 8   Discount_offered     10999 non-null  int64 
 9   Weight_in_gms        10999 non-null  int64 
 10  Reached.on.Time_Y.N  10999 non-null  int64 
dtypes: int64(7), object(4)
memory usage: 945.4+ KB


In [None]:
%%writefile test_dataset.py
import pandas as pd # Importing the pandas library to work with the tabular dataset


df = pd.read_csv('./e-commerce-dataset/Train.csv') # Load the dataset into a pandas dataframe

def check_column_names(df):
    """
    Check if the 'ID' column is not present in the dataset
    """
    return "ID" not in df.columns

def test_check_column_names():
    """
    Test the 'check_column_names' function
    """
    assert check_column_names(df) == True

def check_duplicates(df):
    """
    Check the number of duplicate rows in the dataset
    """
    return df.duplicated().sum()

def test_check_duplicates():
    """
    Test the check_duplicates function
    """
    assert check_duplicates(df) == 0

def check_missing_values(df):
    """
    Check the number of missing values in the dataset
    """
    return df.isnull().sum().sum()

def test_check_missing_values():
    """
    Test the check_missing_values function
    """
    assert check_missing_values(df) == 0

Overwriting test_dataset.py


In [None]:
df.head(3) # Display the first few rows of the dataframe

Unnamed: 0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,D,Flight,4,2,177,3,low,F,44,1233,1
1,F,Flight,4,5,216,2,low,M,59,3088,1
2,A,Flight,2,2,183,4,low,M,48,3374,1


In [None]:
# preprocess the data with a ColumnTransformer
transforms = ColumnTransformer([
    ('ohe', OneHotEncoder(drop='first'), ['Warehouse_block', 'Mode_of_Shipment', 'Gender']),
    ('minmax', MinMaxScaler(), ['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases', 'Discount_offered', 'Weight_in_gms']),
    ('ordinal', OrdinalEncoder(categories=[['low', 'medium', 'high']]), ['Product_importance'])
])
# create a pipeline with the ColumnTransformer and a logistic regression model
model = Pipeline([
    ('preprocess', transforms),
    ('logreg', LogisticRegression())
])

In [None]:
X, y = df.drop('Reached.on.Time_Y.N', axis=1), df['Reached.on.Time_Y.N'] # Split the dataset into features and target
model.fit(X, y) # Fit the model on the training data
joblib.dump(model, 'model.pkl') # Save the model to a file

model.predict(X[9:10]) # Make a prediction using the model

array([1])