In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
data=pd.read_csv('RTD.csv')

In [3]:
import pandas as pd
from pandas.api.types import is_numeric_dtype

def dataset_summary(df):
    summary = {}
    categorical_columns = []
    numeric_columns = []
    high_cardinality_columns = []
    
    for column in df.columns:
        col_summary = {}
        col_summary['Type de Données'] = str(df[column].dtype)
        col_summary['Nombre de Valeurs Uniques'] = int(df[column].nunique())
        col_summary['Valeurs Manquantes'] = int(df[column].isnull().sum())
        
        if is_numeric_dtype(df[column]):
            col_summary['Statistiques Descriptives'] = df[column].describe().to_dict()
            numeric_columns.append(column)
        else:
            col_summary['Valeurs Uniques (Exemple)'] = df[column].unique()[:5].tolist()
            categorical_columns.append(column)
            if should_one_hot_encode(df, column):
                col_summary['Encoded'] = 'One-Hot'
            else:
                col_summary['Encoded'] = 'Label or Frequency'
                high_cardinality_columns.append(column)
        
        summary[column] = col_summary
    
    print("Colonnes Catégorielles:")
    print(categorical_columns)
    print("\nColonnes Numériques:")
    print(numeric_columns)
    print("\nColonnes à haute cardinalité:")
    print(high_cardinality_columns)
    
    return summary

def should_one_hot_encode(df, column):
    unique_count = df[column].nunique()
    total_count = len(df[column])
    threshold = 0.05  
    
    if unique_count / total_count < threshold:
        return True
    return False


summary = dataset_summary(df)


for col, specs in summary.items():
    print(f"Colonne: {col}")
    for key, value in specs.items():
        print(f"  {key}: {value}")
    print("\n")


Colonnes Catégorielles:
['Date', 'Customer_Name', 'Product', 'Payment_Method', 'City', 'Store_Type', 'Customer_Category', 'Season', 'Promotion']

Colonnes Numériques:
['Transaction_ID', 'Total_Items', 'Total_Cost', 'Discount_Applied']

Colonnes à haute cardinalité:
['Date', 'Customer_Name', 'Product']
Colonne: Transaction_ID
  Type de Données: int64
  Nombre de Valeurs Uniques: 1000000
  Valeurs Manquantes: 0
  Statistiques Descriptives: {'count': 1000000.0, 'mean': 1000499999.5, 'std': 288675.2789323441, 'min': 1000000000.0, '25%': 1000249999.75, '50%': 1000499999.5, '75%': 1000749999.25, 'max': 1000999999.0}


Colonne: Date
  Type de Données: object
  Nombre de Valeurs Uniques: 996337
  Valeurs Manquantes: 0
  Valeurs Uniques (Exemple): ['2022-01-21 06:27:29', '2023-03-01 13:01:21', '2024-03-21 15:37:04', '2020-10-31 09:59:47', '2020-12-10 00:59:59']
  Encoded: Label or Frequency


Colonne: Customer_Name
  Type de Données: object
  Nombre de Valeurs Uniques: 329738
  Valeurs Manquant

In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Charger les données

# Remplacer les valeurs manquantes dans la colonne 'Promotion'
data['Promotion'].fillna('No Promotion', inplace=True)

# Gérer les données catégorielles
high_cardinality_cols = ['Customer_Name', 'Product']
other_categorical_cols = ['Payment_Method', 'City', 'Store_Type', 'Customer_Category', 'Season']

# Encoder les colonnes à haute cardinalité avec LabelEncoder
for col in high_cardinality_cols:
    label_encoder = LabelEncoder()
    data[col] = label_encoder.fit_transform(data[col])

# Créer des variables indicatrices pour les colonnes catégorielles
data = pd.get_dummies(data, columns=other_categorical_cols, drop_first=True)

# Normaliser et mettre à l'échelle les données numériques
numeric_cols = ['Transaction_ID', 'Total_Items', 'Total_Cost']
scaler = MinMaxScaler()
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

# Utiliser les données nettoyées et prétraitées pour l'analyse ultérieure ou l'apprentissage automatique
# ...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Promotion'].fillna('No Promotion', inplace=True)


In [6]:
data

Unnamed: 0,Transaction_ID,Date,Customer_Name,Product,Total_Items,Total_Cost,Discount_Applied,Promotion,Payment_Method_Credit Card,Payment_Method_Debit Card,...,Customer_Category_Middle-Aged,Customer_Category_Professional,Customer_Category_Retiree,Customer_Category_Senior Citizen,Customer_Category_Student,Customer_Category_Teenager,Customer_Category_Young Adult,Season_Spring,Season_Summer,Season_Winter
0,0.000000,2022-01-21 06:27:29,291925,263250,0.222222,0.701579,True,No Promotion,False,False,...,False,False,False,False,False,False,False,False,False,True
1,0.000001,2023-03-01 13:01:21,223107,226922,0.111111,0.220316,True,BOGO (Buy One Get One),False,False,...,False,True,False,False,False,False,False,False,False,False
2,0.000002,2024-03-21 15:37:04,198167,460884,0.555556,0.384105,True,No Promotion,True,False,...,False,True,False,False,False,False,False,False,False,True
3,0.000003,2020-10-31 09:59:47,239709,485415,0.000000,0.361474,True,No Promotion,False,False,...,False,False,False,False,False,False,False,True,False,False
4,0.000004,2020-12-10 00:59:59,297796,160437,1.000000,0.120211,False,Discount on Selected Items,False,True,...,False,False,False,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0.999996,2023-03-27 06:12:10,198158,363401,0.000000,0.179684,False,No Promotion,False,True,...,True,False,False,False,False,False,False,False,False,True
999996,0.999997,2022-05-19 05:13:58,106221,98017,0.777778,0.792105,True,Discount on Selected Items,False,False,...,False,False,False,True,False,False,False,True,False,False
999997,0.999998,2021-09-03 13:59:39,73173,529759,0.222222,0.586737,False,No Promotion,True,False,...,False,False,False,False,False,False,False,False,False,True
999998,0.999999,2023-10-17 05:50:40,221384,148135,0.222222,0.194526,True,BOGO (Buy One Get One),False,True,...,False,False,True,False,False,False,False,False,False,True
