In [1]:
import opendatasets as od # Importing the opendatasets library to download the dataset from Kaggle
import pandas as pd # Importing the pandas library to work with the tabular dataset
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder # importing the necessary classes to process the data
from sklearn.compose import ColumnTransformer # importing the 'ColumnTransformer' class to build a complex transformer
from sklearn.pipeline import Pipeline # import the 'Pipeline' class to build pipeline
from sklearn.linear_model import LogisticRegression # import the 'LogisticRegression' for prediction
import joblib # import the 'joblib' module to save or download the model

In [2]:
# Fetch the e-commerce dataset from Kaggle into the current working directory
# (force=True is passed straight through to opendatasets).
dataset_url = "https://www.kaggle.com/datasets/poojakeer/e-commerce-dataset"
od.download(dataset_url, force=True)

Dataset URL: https://www.kaggle.com/datasets/poojakeer/e-commerce-dataset
Downloading e-commerce-dataset.zip to ./e-commerce-dataset


100%|██████████| 121k/121k [00:00<00:00, 306kB/s]







In [3]:
# Read the downloaded training CSV into a DataFrame and preview its first three rows
train_csv_path = './e-commerce-dataset/Train.csv'
df = pd.read_csv(train_csv_path)
df.head(3)

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,D,Flight,4,2,177,3,low,F,44,1233,1
1,2,F,Flight,4,5,216,2,low,M,59,3088,1
2,3,A,Flight,2,2,183,4,low,M,48,3374,1


In [4]:
# Remove the 'ID' column when present. errors='ignore' turns this into a no-op
# if the column is already gone, so the cell can be re-run safely without a
# side-effecting conditional expression or an in-place mutation.
df = df.drop(columns=['ID'], errors='ignore')
# Persist the cleaned frame over the original file: test_dataset.py reads this
# same path and asserts that no 'ID' column exists.
df.to_csv('./e-commerce-dataset/Train.csv', index=False)
# Reload from disk so the in-memory frame is exactly what was written out
df = pd.read_csv('./e-commerce-dataset/Train.csv')
df.info()  # Summarize column names, non-null counts and dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Warehouse_block      10999 non-null  object
 1   Mode_of_Shipment     10999 non-null  object
 2   Customer_care_calls  10999 non-null  int64 
 3   Customer_rating      10999 non-null  int64 
 4   Cost_of_the_Product  10999 non-null  int64 
 5   Prior_purchases      10999 non-null  int64 
 6   Product_importance   10999 non-null  object
 7   Gender               10999 non-null  object
 8   Discount_offered     10999 non-null  int64 
 9   Weight_in_gms        10999 non-null  int64 
 10  Reached.on.Time_Y.N  10999 non-null  int64 
dtypes: int64(7), object(4)
memory usage: 945.4+ KB


In [5]:
%%writefile test_dataset.py
import pandas as pd # Importing the pandas library to work with the tabular dataset


# Module-level frame shared by the test functions below; read once at import time
TRAIN_CSV_PATH = './e-commerce-dataset/Train.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

def check_data_types(df):
    """
    Map every column name of ``df`` to the name of its dtype (e.g. 'int64')
    """
    dtype_names = {}
    for column, dtype in df.dtypes.items():
        dtype_names[column] = dtype.name
    return dtype_names

def test_numeric_columns():
    """
    Verify that each column intended for the MinMaxScaler is stored as int64
    """
    scaled_columns = [
        'Customer_care_calls',
        'Customer_rating',
        'Cost_of_the_Product',
        'Prior_purchases',
        'Discount_offered',
        'Weight_in_gms',
    ]
    dtype_by_column = check_data_types(df)
    # Checked one column at a time, in list order, stopping at the first failure
    for name in scaled_columns:
        assert dtype_by_column[name] == 'int64'
    
def check_column_names(df):
    """
    Return True when ``df`` has no column named 'ID', False otherwise
    """
    id_present = "ID" in df.columns
    return not id_present

def test_check_column_names():
    """
    Test the 'check_column_names' function: the module-level dataset must not
    contain an 'ID' column
    """
    # check_column_names already returns a bool, so assert it directly instead
    # of comparing with `== True`; the message explains a failure at a glance.
    assert check_column_names(df), "dataset still contains an 'ID' column"

def check_duplicates(df):
    """
    Count the rows of ``df`` that are flagged as duplicates by pandas
    """
    duplicate_flags = df.duplicated()
    return duplicate_flags.sum()

def test_check_duplicates():
    """
    The dataset must not contain any duplicated rows
    """
    duplicate_count = check_duplicates(df)
    assert duplicate_count == 0

def check_missing_values(df):
    """
    Count the missing (null) cells across all columns of ``df``
    """
    nulls_per_column = df.isnull().sum()
    return nulls_per_column.sum()

def test_check_missing_values():
    """
    The dataset must not contain any missing values
    """
    missing_count = check_missing_values(df)
    assert missing_count == 0

Overwriting test_dataset.py


In [6]:
df.head(n=3)  # Preview the first three rows of the reloaded frame

Unnamed: 0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,D,Flight,4,2,177,3,low,F,44,1233,1
1,F,Flight,4,5,216,2,low,M,59,3088,1
2,A,Flight,2,2,183,4,low,M,48,3374,1


In [7]:
# Column groups, one per preprocessing strategy
nominal_columns = ['Warehouse_block', 'Mode_of_Shipment', 'Gender']
numeric_columns = [
    'Customer_care_calls',
    'Customer_rating',
    'Cost_of_the_Product',
    'Prior_purchases',
    'Discount_offered',
    'Weight_in_gms',
]
importance_levels = ['low', 'medium', 'high']  # category order handed to the ordinal encoder

# Per-group preprocessing: one-hot encode the nominal columns (dropping the
# first level of each), min-max scale the numeric columns, and ordinal-encode
# 'Product_importance' using the explicit level order above.
one_hot_step = ('ohe', OneHotEncoder(drop='first'), nominal_columns)
scaling_step = ('minmax', MinMaxScaler(), numeric_columns)
ordinal_step = ('ordinal', OrdinalEncoder(categories=[importance_levels]), ['Product_importance'])
transforms = ColumnTransformer(transformers=[one_hot_step, scaling_step, ordinal_step])

# Full model: preprocessing followed by a logistic regression classifier
model = Pipeline(steps=[
    ('preprocess', transforms),
    ('logreg', LogisticRegression()),
])

In [8]:
# Separate the predictors from the target column
target_column = 'Reached.on.Time_Y.N'
X = df.drop(columns=target_column)
y = df[target_column]

model.fit(X, y)  # Train the preprocessing + logistic regression pipeline on all rows
joblib.dump(model, 'model.pkl')  # Serialize the fitted pipeline to disk

model.predict(X[9:10])  # Sanity check: predict for the single row in the 9:10 slice

array([1])