<a href="https://colab.research.google.com/github/SylviaNice/7162856-G-rez-Git-et-GitHub/blob/main/Projet_Bottleneck_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Projet Python - Optimiser la gestion des données pour Bottleneck

In [82]:
# 1. Library imports and notebook-wide configuration
# (nothing is installed by this cell; the packages are only imported)

# Standard library
import warnings

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import zscore

# Do not truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas deprecation notices — consider narrowing it.
warnings.filterwarnings('ignore')

In [83]:
# 2. Load the three source workbooks and preview each of them

# The workbooks were downloaded by hand and uploaded into the Colab session,
# so the paths below are relative to the current working directory.
erp_file, web_file, liaison_file = (
    'Fichier_erp.xlsx',
    'Fichier_web.xlsx',
    'fichier_liaison.xlsx',
)

# Read the workbooks in a fixed order: ERP, web, then liaison
erp_df, web_df, liaison_df = (
    pd.read_excel(path) for path in (erp_file, web_file, liaison_file)
)

# Rich preview of the first five rows of each frame, in the same order
display(erp_df.head(), web_df.head(), liaison_df.head())

Unnamed: 0,product_id,onsale_web,price,stock_quantity,stock_status
0,3847,1,24.2,0,outofstock
1,3849,1,34.3,0,outofstock
2,3850,1,20.8,0,outofstock
3,4032,1,14.1,0,outofstock
4,4039,1,46.0,0,outofstock


Unnamed: 0,sku,virtual,downloadable,rating_count,average_rating,total_sales,tax_status,tax_class,post_author,post_date,post_date_gmt,post_content,post_title,post_excerpt,post_status,comment_status,ping_status,post_password,post_name,post_modified,post_modified_gmt,post_content_filtered,post_parent,guid,menu_order,post_type,post_mime_type,comment_count
0,16004.0,0,0,0,0.0,5.0,,,2.0,2018-06-07 16:27:25,2018-06-07 14:27:25,,Château du Couvent Pomerol 2017,,publish,closed,closed,,chateau-du-couvent-pomerol-2017,2020-08-25 18:35:02,2020-08-25 16:35:02,,0.0,https://www.bottle-neck.fr/wp-content/uploads/...,0.0,attachment,image/jpeg,0.0
1,,0,0,0,,,,,,NaT,NaT,,,,,,,,,NaT,NaT,,,,,,,
2,15075.0,0,0,0,0.0,3.0,taxable,,2.0,2018-02-14 15:39:43,2018-02-14 14:39:43,,Parés Baltà Penedès Indigena 2017,Des couleurs et aromes intenses où le fruit et...,publish,closed,closed,,pares-balta-penedes-indigena-2017,2020-08-20 15:35:02,2020-08-20 13:35:02,,0.0,https://www.bottle-neck.fr/?post_type=product&...,0.0,product,,0.0
3,16209.0,0,0,0,0.0,6.0,taxable,,2.0,2018-02-14 17:15:31,2018-02-14 16:15:31,,Maurel Cabardès Tradition 2017,"Un joli nez aux arômes de fruits rouges, de ca...",publish,closed,closed,,maurel-cabardes-tradition-2017,2020-08-05 18:05:03,2020-08-05 16:05:03,,0.0,https://www.bottle-neck.fr/?post_type=product&...,0.0,product,,0.0
4,15763.0,0,0,0,0.0,1.0,,,2.0,2020-01-25 14:08:16,2020-01-25 13:08:16,,Domaine de la Monardière Vacqueyras Vieilles V...,,publish,closed,closed,,domaine-de-la-monardiere-vacqueyras-les-vieill...,2020-08-21 11:35:02,2020-08-21 09:35:02,,0.0,https://www.bottle-neck.fr/wp-content/uploads/...,0.0,attachment,image/jpeg,0.0


Unnamed: 0,product_id,id_web
0,3847,15298
1,3849,15296
2,3850,15300
3,4032,19814
4,4039,19815


In [84]:
# Row and column counts of each dataset, one (rows, columns) tuple per line
# in the order ERP, web, liaison

print(erp_df.shape, web_df.shape, liaison_df.shape, sep="\n")

(825, 5)
(1513, 28)
(825, 2)


In [85]:
# Column names and dtypes of the ERP, web and liaison frames,
# each listing followed by a 40-dash separator line
separator = "-" * 40

print(
    erp_df.dtypes, separator,
    web_df.dtypes, separator,
    liaison_df.dtypes, separator,
    sep="\n",
)

product_id          int64
onsale_web          int64
price             float64
stock_quantity      int64
stock_status       object
dtype: object
----------------------------------------
sku                              object
virtual                           int64
downloadable                      int64
rating_count                      int64
average_rating                  float64
total_sales                     float64
tax_status                       object
tax_class                       float64
post_author                     float64
post_date                datetime64[ns]
post_date_gmt            datetime64[ns]
post_content                    float64
post_title                       object
post_excerpt                     object
post_status                      object
comment_status                   object
ping_status                      object
post_password                   float64
post_name                        object
post_modified            datetime64[ns]
post_modified_g

In [86]:
# Reload the liaison table and rename its 'id_web' column to 'SKU'
# (the original header was entered incorrectly by the intern).
# No merge happens in this cell.
# NOTE(review): the web frame's matching column is lowercase 'sku', so a later
# merge will need left_on/right_on or a consistent rename — confirm downstream.

liaison_df = (
    pd.read_excel("fichier_liaison.xlsx")
    .rename(columns={'id_web': 'SKU'})
)
print(liaison_df)

     product_id      SKU
0          3847    15298
1          3849    15296
2          3850    15300
3          4032    19814
4          4039    19815
..          ...      ...
820        7203      NaN
821        7204      NaN
822        7247  13127-1
823        7329  14680-1
824        7338    16230

[825 rows x 2 columns]


In [87]:
# Data-quality check: number of missing values per column in the ERP, web and
# liaison frames, each listing followed by a 40-dash separator line
separator = "-" * 40

print(
    erp_df.isna().sum(), separator,
    web_df.isna().sum(), separator,
    liaison_df.isna().sum(), separator,
    sep="\n",
)

product_id        0
onsale_web        0
price             0
stock_quantity    0
stock_status      0
dtype: int64
----------------------------------------
sku                        85
virtual                     0
downloadable                0
rating_count                0
average_rating             83
total_sales                83
tax_status                797
tax_class                1513
post_author                83
post_date                  83
post_date_gmt              83
post_content             1513
post_title                 83
post_excerpt              797
post_status                83
comment_status             83
ping_status                83
post_password            1513
post_name                  83
post_modified              83
post_modified_gmt          83
post_content_filtered    1513
post_parent                83
guid                       83
menu_order                 83
post_type                  83
post_mime_type            799
comment_count              83
dtype:

In [93]:
## Price inconsistencies: ERP rows whose price is zero or negative

erp_df.loc[erp_df['price'].le(0)]

Unnamed: 0,product_id,onsale_web,price,stock_quantity,stock_status
469,5017,0,-8.0,0,outofstock
739,6594,0,-1.0,192,instock


In [94]:
# Missing values per column of the liaison table
print(liaison_df.isnull().sum())

product_id     0
SKU           91
dtype: int64


In [95]:
# Liaison rows in which at least one column is missing
print(liaison_df.loc[liaison_df.isna().any(axis="columns")])

     product_id  SKU
19         4055  NaN
49         4090  NaN
50         4092  NaN
119        4195  NaN
131        4209  NaN
..          ...  ...
817        7196  NaN
818        7200  NaN
819        7201  NaN
820        7203  NaN
821        7204  NaN

[91 rows x 2 columns]
