In [None]:
import pandas as pd

In [10]:
file_path = 'retail_sales_dataset.csv'
data = pd.read_csv(file_path)


data = data.drop(columns=['Transaction ID', 'Date'])


In [11]:
data.head()

Unnamed: 0,Customer ID,Gender,Age,Product Category,Quantity,Price per Unit,Total Amount
0,CUST001,Male,34,Beauty,3,50,150
1,CUST002,Female,26,Clothing,2,500,1000
2,CUST003,Male,50,Electronics,1,30,30
3,CUST004,Male,37,Clothing,1,500,500
4,CUST005,Male,30,Beauty,2,50,100


In [12]:
#checking for null values
data.isnull().sum()

Customer ID         0
Gender              0
Age                 0
Product Category    0
Quantity            0
Price per Unit      0
Total Amount        0
dtype: int64

**Hypothesis testing**

In [9]:

from sklearn.preprocessing import LabelEncoder
from scipy.stats import ttest_ind, chi2_contingency, f_oneway, pearsonr




label_encoders = {}
for column in ['Customer ID', 'Gender', 'Product Category']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# T-test: Compare Total Amount between Male and Female customers
male_total = data[data['Gender'] == 1]['Total Amount']
female_total = data[data['Gender'] == 0]['Total Amount']
t_stat, p_value = ttest_ind(male_total, female_total)
print(f'T-test: t-statistic = {t_stat}, p-value = {p_value}')

# Chi-square test: Check independence between Gender and Product Category
contingency_table = pd.crosstab(data['Gender'], data['Product Category'])
chi2, p, dof, expected = chi2_contingency(contingency_table)
print(f'Chi-square test: chi2 = {chi2}, p-value = {p}')

# ANOVA: Compare Total Amount across different Product Categories
categories = data['Product Category'].unique()
category_data = [data[data['Product Category'] == category]['Total Amount'] for category in categories]
f_stat, p_value = f_oneway(*category_data)
print(f'ANOVA: F-statistic = {f_stat}, p-value = {p_value}')

# Correlation: Check correlation between Age and Total Amount
corr, p_value = pearsonr(data['Age'], data['Total Amount'])
print(f'Correlation: correlation coefficient = {corr}, p-value = {p_value}')


T-test: t-statistic = -0.03161341824319852, p-value = 0.9747866634918314
Chi-square test: chi2 = 1.673837085800602, p-value = 0.43304287262068974
ANOVA: F-statistic = 0.159430595139859, p-value = 0.8526508750301867
Correlation: correlation coefficient = -0.06056802388304562, p-value = 0.05553184763493584


Interpretation of Results:

**T-test:** A low p-value (typically < 0.05) indicates a significant difference in the means of Total Amount spent by Male and Female customers.
**Chi-square test**: A low p-value suggests a significant association between Gender and Product Category.
**ANOVA:** A low p-value indicates significant differences in the means of Total Amount across different Product Categories.
**Correlation**: A high absolute value of the correlation coefficient (close to 1 or -1) and a low p-value suggest a strong correlation between Age and Total Amount.

****

**Pycaret**

PyCaret is an open-source, low-code machine learning library in Python that automates machine learning workflows. It is designed to reduce the time required to develop and deploy machine learning models. PyCaret is particularly useful for data scientists and analysts who want to perform complex machine learning tasks with minimal coding

In [1]:
pip install pycaret

Collecting pycaret
  Downloading pycaret-3.3.2-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.1/486.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting joblib<1.4,>=1.2.0 (from pycaret)
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.2/302.2 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn>1.4.0 (from pycaret)
  Downloading scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyod>=1.1.3 (from pycaret)
  Downloading pyod-2.0.0.tar.gz (164 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.0/165.0 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting imbalanced-learn>=0.12.0 (from p

In [4]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from pycaret.regression import setup, compare_models, predict_model



# Encode categorical variables
label_encoders = {}
for column in ['Customer ID', 'Gender', 'Product Category']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Separate features and target
X = data.drop(columns=['Total Amount'])
y = data['Total Amount']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the PyCaret regression setup
reg = setup(data=pd.concat([X_train, y_train], axis=1), target='Total Amount', verbose=False)

# Compare different models and find the best one
best_model = compare_models()

# Evaluate the best model on the test set
predictions = predict_model(best_model, data=X_test)
print(predictions)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
dt,Decision Tree Regressor,0.0,0.0,0.0,1.0,0.0,0.0,0.025
et,Extra Trees Regressor,0.0,0.0,0.0,1.0,0.0,0.0,0.136
gbr,Gradient Boosting Regressor,1.1169,3.1301,1.6508,1.0,0.0131,0.0085,0.112
lightgbm,Light Gradient Boosting Machine,1.0775,11.9811,2.6604,1.0,0.0192,0.0073,0.261
xgboost,Extreme Gradient Boosting,0.0001,0.0,0.0002,1.0,0.0,0.0,0.069
rf,Random Forest Regressor,0.0,0.0,0.0,1.0,0.0,0.0,0.203
ada,AdaBoost Regressor,77.8089,13248.1689,114.3382,0.9584,0.2402,0.2255,0.044
lasso,Lasso Regression,177.9941,47028.777,216.2009,0.8519,0.9534,1.6862,0.025
llar,Lasso Least Angle Regression,177.9941,47028.7758,216.2009,0.8519,0.9534,1.6862,0.038
br,Bayesian Ridge,177.7935,47126.2343,216.4123,0.8516,0.9384,1.6724,0.06


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

     Customer ID  Gender  Age  Product Category  Quantity  Price per Unit  \
521          522       1   46                 0         3             500   
737          738       1   41                 1         2              50   
740          741       1   48                 1         1             300   
660          661       0   44                 1         4              25   
411          412       0   19                 2         4             500   
..           ...     ...  ...               ...       ...             ...   
408          409       0   21                 2         3             300   
332          333       0   54                 2         4             300   
208          209       0   30                 2         4              50   
613          614       0   39                 0         4             300   
78            78       1   34                 0         1             300   

     prediction_label  
521            1500.0  
737             100.0  
740

**AutoMl library -tpot**

In [5]:
pip install tpot


Collecting tpot
  Downloading TPOT-0.12.2-py3-none-any.whl (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.4/87.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting deap>=1.2 (from tpot)
  Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting update-checker>=0.16 (from tpot)
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Collecting stopit>=1.1.1 (from tpot)
  Downloading stopit-1.1.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: stopit
  Building wheel for stopit (setup.py) ... [?25l[?25hdone
  Created wheel for stopit: filename=stopit-1.1.2-py3-none-any.whl size=11938 sha256=d1f6477161421803d96ec931ee6b733f64adfb8adfa4fa91d4aa1eeb71891866
  Stored in directory: /r

In [8]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tpot import TPOTRegressor

label_encoders = {}
for column in ['Customer ID', 'Gender', 'Product Category']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le


X = data.drop(columns=['Total Amount'])
y = data['Total Amount']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


tpot = TPOTRegressor(verbosity=2, generations=5, population_size=50, random_state=42)


tpot.fit(X_train, y_train)


print(tpot.score(X_test, y_test))

# Export the pipeline
tpot.export('tpot_best_pipeline.py')


Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.0

Generation 2 - Current best internal CV score: 0.0

Generation 3 - Current best internal CV score: 0.0

Generation 4 - Current best internal CV score: 0.0

Generation 5 - Current best internal CV score: 0.0

Best pipeline: DecisionTreeRegressor(input_matrix, max_depth=6, min_samples_leaf=17, min_samples_split=5)
-0.0
