## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,roc_auc_score
from IPython.display import FileLink




## Loading the CSV file

In [2]:
raw="E_Commerce.csv"
df=pd.read_csv(raw)

## Data Preprocessing

In [3]:
df.head()

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,D,Flight,4,2,177,3,low,F,44,1233,1
1,2,F,Flight,4,5,216,2,low,M,59,3088,1
2,3,A,Flight,2,2,183,4,low,M,48,3374,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   10999 non-null  int64 
 1   Warehouse_block      10999 non-null  object
 2   Mode_of_Shipment     10999 non-null  object
 3   Customer_care_calls  10999 non-null  int64 
 4   Customer_rating      10999 non-null  int64 
 5   Cost_of_the_Product  10999 non-null  int64 
 6   Prior_purchases      10999 non-null  int64 
 7   Product_importance   10999 non-null  object
 8   Gender               10999 non-null  object
 9   Discount_offered     10999 non-null  int64 
 10  Weight_in_gms        10999 non-null  int64 
 11  Reached.on.Time_Y.N  10999 non-null  int64 
dtypes: int64(8), object(4)
memory usage: 1.0+ MB


In [5]:
df.isnull().sum()

ID                     0
Warehouse_block        0
Mode_of_Shipment       0
Customer_care_calls    0
Customer_rating        0
Cost_of_the_Product    0
Prior_purchases        0
Product_importance     0
Gender                 0
Discount_offered       0
Weight_in_gms          0
Reached.on.Time_Y.N    0
dtype: int64

In [6]:
df.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
10994    False
10995    False
10996    False
10997    False
10998    False
Length: 10999, dtype: bool

In [7]:
df.duplicated().sum()

np.int64(0)

In [8]:
df = df.drop(columns=['ID'])

## Feautre Engineering

In [9]:
df['Weight_Discount'] = df['Weight_in_gms'] * df['Discount_offered']
df['High_Discount'] = (df['Discount_offered'] > 20).astype(int)

df['Customer_Issue'] = (
    (df['Customer_care_calls'] >= 4) &
    (df['Customer_rating'] <= 2)
).astype(int)

df['Weight_Category'] = pd.cut(
    df['Weight_in_gms'],
    bins=[0, 2000, 4000, 6000, 10000],
    labels=['Light', 'Medium', 'Heavy', 'Very_Heavy']
)

## One Hot Encoding

In [10]:
categorical_cols = [
    'Warehouse_block',
    'Mode_of_Shipment',
    'Product_importance',
    'Gender',
    'Weight_Category'
]

categorical_cols = [c for c in categorical_cols if c in df.columns]

df = pd.get_dummies(
    df,
    columns=categorical_cols,
    drop_first=True
)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 22 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   Customer_care_calls         10999 non-null  int64
 1   Customer_rating             10999 non-null  int64
 2   Cost_of_the_Product         10999 non-null  int64
 3   Prior_purchases             10999 non-null  int64
 4   Discount_offered            10999 non-null  int64
 5   Weight_in_gms               10999 non-null  int64
 6   Reached.on.Time_Y.N         10999 non-null  int64
 7   Weight_Discount             10999 non-null  int64
 8   High_Discount               10999 non-null  int64
 9   Customer_Issue              10999 non-null  int64
 10  Warehouse_block_B           10999 non-null  bool 
 11  Warehouse_block_C           10999 non-null  bool 
 12  Warehouse_block_D           10999 non-null  bool 
 13  Warehouse_block_F           10999 non-null  bool 
 14  Mode_o

## Define X and Y

In [12]:
X =df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product',
       'Prior_purchases', 'Discount_offered', 'Weight_in_gms','Warehouse_block_B', 'Warehouse_block_C',
       'Warehouse_block_D', 'Warehouse_block_F', 'Mode_of_Shipment_Road',
       'Mode_of_Shipment_Ship', 'Product_importance_low',
       'Product_importance_medium', 'Gender_M','Weight_Discount']]     
y = df['Reached.on.Time_Y.N']

## Train-Test-Split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42,stratify=y)

## Scaling

In [14]:
numeric_cols = [
    'Customer_care_calls',
    'Customer_rating',
    'Cost_of_the_Product',
    'Prior_purchases',
    'Discount_offered',
    'Weight_in_gms',
    'Weight_Discount'
]

scaler = StandardScaler()

X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

## Logistic Regression Model

In [15]:
model = LogisticRegression(max_iter=1000,class_weight='balanced')
model.fit(X_train, y_train)

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",'balanced'
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


## Probability Prediction

In [16]:
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = (y_prob > 0.4).astype(int)

## Evaluation

In [17]:
print(confusion_matrix(y_test, y_pred))

[[ 866  465]
 [ 706 1263]]


In [18]:
classification_rep=classification_report(y_test,y_pred)
print(classification_rep)

              precision    recall  f1-score   support

           0       0.55      0.65      0.60      1331
           1       0.73      0.64      0.68      1969

    accuracy                           0.65      3300
   macro avg       0.64      0.65      0.64      3300
weighted avg       0.66      0.65      0.65      3300



In [19]:
FileLink("E_Commerce_PowerBI.csv")

In [20]:
import os
os.getcwd()

'C:\\Users\\oanka'

In [21]:
os.listdir()

['.anaconda',
 '.bundle',
 '.cache',
 '.cisco',
 '.conda',
 '.condarc',
 '.continuum',
 '.docker',
 '.dotnet',
 '.expo',
 '.idlerc',
 '.insomniac',
 '.ipynb_checkpoints',
 '.ipython',
 '.jupyter',
 '.local',
 '.m2',
 '.matplotlib',
 '.ms-ad',
 '.spyder-py3',
 '.streamlit',
 '.templateengine',
 '.virtual_documents',
 '.vscode',
 '.vscode-print-resource-cache',
 '3D Objects',
 'anaconda3',
 'anaconda_projects',
 'ansel',
 'AppData',
 'Application Data',
 'ARIMA.ipynb',
 'Audio-Text.ipynb',
 'breast_cancer_data.csv',
 'CapStone_Decision_Tree.ipynb',
 'CapStone_LogisticRegression.ipynb',
 'Capstone_Project_EDA.ipynb',
 'Capstone_Project_KNN.ipynb',
 'Capstone_Project_RandomForest.ipynb',
 'cars_data-1-1.csv',
 'Contacts',
 'Cookies',
 'Create Visualizations using Matplotlib, Seaborn & Folium.ipynb',
 'customer_segmentation.csv',
 'data',
 'Data_Science_1.ipynb',
 'Data_Science_2.ipynb',
 'Desktop',
 'Documents',
 'Downloads',
 'Dropbox',
 'ecommerce_sales_analysis.ipynb',
 'Excercise_Pytho

In [22]:
df.head()


Unnamed: 0,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N,Weight_Discount,High_Discount,Customer_Issue,...,Warehouse_block_D,Warehouse_block_F,Mode_of_Shipment_Road,Mode_of_Shipment_Ship,Product_importance_low,Product_importance_medium,Gender_M,Weight_Category_Medium,Weight_Category_Heavy,Weight_Category_Very_Heavy
0,4,2,177,3,44,1233,1,54252,1,1,...,True,False,False,False,True,False,False,False,False,False
1,4,5,216,2,59,3088,1,182192,1,0,...,False,True,False,False,True,False,True,True,False,False
2,2,2,183,4,48,3374,1,161952,1,0,...,False,False,False,False,True,False,True,True,False,False
3,3,3,176,4,10,1177,1,11770,0,0,...,False,False,False,False,False,True,True,False,False,False
4,2,2,184,3,46,2484,1,114264,1,0,...,False,False,False,False,False,True,False,True,False,False


In [23]:
df.to_csv("E_Commerce_PowerBI.csv", index=False)


In [24]:
os.listdir()

['.anaconda',
 '.bundle',
 '.cache',
 '.cisco',
 '.conda',
 '.condarc',
 '.continuum',
 '.docker',
 '.dotnet',
 '.expo',
 '.idlerc',
 '.insomniac',
 '.ipynb_checkpoints',
 '.ipython',
 '.jupyter',
 '.local',
 '.m2',
 '.matplotlib',
 '.ms-ad',
 '.spyder-py3',
 '.streamlit',
 '.templateengine',
 '.virtual_documents',
 '.vscode',
 '.vscode-print-resource-cache',
 '3D Objects',
 'anaconda3',
 'anaconda_projects',
 'ansel',
 'AppData',
 'Application Data',
 'ARIMA.ipynb',
 'Audio-Text.ipynb',
 'breast_cancer_data.csv',
 'CapStone_Decision_Tree.ipynb',
 'CapStone_LogisticRegression.ipynb',
 'Capstone_Project_EDA.ipynb',
 'Capstone_Project_KNN.ipynb',
 'Capstone_Project_RandomForest.ipynb',
 'cars_data-1-1.csv',
 'Contacts',
 'Cookies',
 'Create Visualizations using Matplotlib, Seaborn & Folium.ipynb',
 'customer_segmentation.csv',
 'data',
 'Data_Science_1.ipynb',
 'Data_Science_2.ipynb',
 'Desktop',
 'Documents',
 'Downloads',
 'Dropbox',
 'ecommerce_sales_analysis.ipynb',
 'Excercise_Pytho

In [25]:
FileLink("E_Commerce_PowerBI.csv")