# Statistical Analysis using Python

## 1. Extracting and cleaning data

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns

# Extra strings to interpret as NaN when parsing the CSV files
missing_values = ["n/a", "na", "--"]


def _read_table(path):
    """Load a comma-separated file, mapping the ``missing_values`` tokens to NaN."""
    return pd.read_csv(path, sep=",", na_values=missing_values)


def _count_missing(frame):
    """Return the number of NaN cells in ``frame``, summed over all columns."""
    return frame.isna().sum().sum()


# Read files: the three tables share the same parsing options
customers = _read_table("customers.csv")
products = _read_table("products.csv")
transactions = _read_table("transactions.csv")

# Check for missing values: one total per table
print(f"Missing values in customers : {_count_missing(customers)}")
print(f"Missing values in products : {_count_missing(products)}")
print(f"Missing values in transactions: {_count_missing(transactions)}")


Missing values in customers : 0
Missing values in products : 0
Missing values in transactions: 0


In [2]:
# Preview the customers table (first five rows)
customers.head(5)

Unnamed: 0,client_id,sex,birth
0,c_4410,f,1967
1,c_7839,f,1975
2,c_1699,f,1984
3,c_5961,f,1962
4,c_5320,m,1943


In [3]:
# Check for outliers in customers

# Gender/Sex: one row per distinct code, so any unexpected value would be
# visible in the printed counts
genders = customers.groupby(["sex"]).size()
print("There are only two types of genders/sexes in the data:")
print(genders)

# Age
# Ages are measured against a fixed reference year instead of the wall clock.
# With `dt.datetime.now().year` every age (and each statistic derived from it)
# shifts by one each calendar year, so re-running the notebook no longer
# reproduces its results. 2023 is the year implied by the recorded output of
# this cell (birth 1967 -> age 56); change it here to re-base the analysis.
REFERENCE_YEAR = 2023
customers['age'] = REFERENCE_YEAR - customers['birth']
min_age = customers['age'].min()
max_age = customers['age'].max()
print(f"The youngest customer is {min_age} years old, the oldest is {max_age}")

# Show the table again, now with the derived `age` column
customers.head()

There are only two types of genders/sexes in the data:
sex
f    4491
m    4132
dtype: int64
The youngest customer is 19 years old, the oldest is 94


Unnamed: 0,client_id,sex,birth,age
0,c_4410,f,1967,56
1,c_7839,f,1975,48
2,c_1699,f,1984,39
3,c_5961,f,1962,61
4,c_5320,m,1943,80


In [4]:
# Preview the products table (first five rows)
products.head(5)

Unnamed: 0,id_prod,price,categ
0,0_1421,19.99,0
1,0_1368,5.13,0
2,0_731,17.99,0
3,1_587,4.99,1
4,0_1507,3.99,0


In [5]:
# Preview the transactions table (first five rows)
transactions.head(5)

Unnamed: 0,id_prod,date,session_id,client_id
0,0_1483,2021-04-10 18:37:28.723910,s_18746,c_4450
1,2_226,2022-02-03 01:55:53.276402,s_159142,c_277
2,1_374,2021-09-23 15:13:46.938559,s_94290,c_4270
3,0_2186,2021-10-17 03:27:18.783634,s_105936,c_4597
4,0_1351,2021-07-17 20:34:25.800563,s_63642,c_1242


## 2. Statistical analysis for commercial purposes

- Central tendency indicators
- Dispersion indicators
- Concentration using Lorenz curve and Gini index
- Several bivariate analyses (age/gender)

1. Central tendency indicators: mean and median

In [6]:
# Central tendency of the customers' ages
median_age = customers["age"].median()
mean_age = customers["age"].mean()

# One print call, one line per item (header, mean, median)
print(
    "For customers (age):",
    f"mean = {mean_age}",
    f"median = {median_age}",
    sep="\n",
)

For customers (age):
mean = 44.71912327496231
median = 44.0


In [7]:
# Central tendency of the products' prices
median_price = products["price"].median()
mean_price = products["price"].mean()

# One print call, one line per item (header, mean, median)
print(
    "For products (price):",
    f"mean = {mean_price}",
    f"median = {median_price}",
    sep="\n",
)

For products (price):
mean = 21.85664131426833
median = 13.06


## 3. Statistical analysis for marketing purposes

Check for correlations between:
- the gender of customers and categories of products purchased
- the age of customers and the average basket (number of items and total amount)
- the age of customers and the categories of products purchased
- the age of customers and the frequency of purchase