In [1]:
import pandas as pd
import numpy as np

# Load the raw data set.
file_path = r"E:\Kursy\pliki\raw_data\restaurant_dirty_data.csv"
df = pd.read_csv(file_path)

# Quick look at the data.
print(df.head())        # first 5 rows
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()               # column dtypes and non-null counts
print(df.describe())    # summary statistics of the numeric columns

     Order ID Customer ID     Category             Item  Price  Quantity  \
0  ORD_705844    CUST_092  Side Dishes       Side Salad    3.0       1.0   
1  ORD_338528    CUST_021  Side Dishes  Mashed Potatoes    4.0       3.0   
2  ORD_443849    CUST_029  Main Dishes  Grilled Chicken   15.0       4.0   
3  ORD_630508    CUST_075       Drinks              NaN    NaN       2.0   
4  ORD_648269    CUST_031  Main Dishes    Pasta Alfredo   12.0       4.0   

   Order Total  Order Date  Payment Method  
0          3.0  2023-12-21     Credit Card  
1         12.0  2023-05-19  Digital Wallet  
2         60.0  2023-09-27     Credit Card  
3          5.0  2022-08-09     Credit Card  
4         48.0  2022-05-15            Cash  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17534 entries, 0 to 17533
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Order ID        17534 non-null  object 
 1   Customer ID     17534 non

In [2]:
# Create a backup copy of the loaded data.
# The snapshot is stored under its own name (`df_raw`); rebinding `df` to its
# own copy would discard the original and leave no backup at all. The cleaning
# steps keep working on `df`, and `df_raw` is an independent (deep) copy.

df_raw = df.copy()

In [3]:
# Check for duplicates: count the rows that exactly repeat an earlier row.

duplicate_flags = df.duplicated()
duplicates = duplicate_flags.sum()
print(f"Liczba duplikatów: {duplicates}")

Liczba duplikatów: 0


In [4]:
# Check for missing values: number of NaN entries in each column.

missing_per_column = df.isna().sum()
print(missing_per_column)

Order ID             0
Customer ID          0
Category             0
Item              1758
Price              876
Quantity           430
Order Total        430
Order Date           0
Payment Method    1082
dtype: int64


In [5]:
# Fill in missing values: orders without an item name get the label 'Unknown'.

item_is_missing = df['Item'].isna()
df.loc[item_is_missing, 'Item'] = 'Unknown'

In [6]:
# Price, Quantity and Order Total are tied by Order Total = Price * Quantity,
# so whenever exactly one of the three is missing it is derived from the other
# two. Each mask is evaluated right before its assignment, in the same order
# as the three cases are listed.

# Price is missing, Order Total and Quantity are known.
price_recoverable = (
    df[['Order Total', 'Quantity']].notna().all(axis=1) & df['Price'].isna()
)
df.loc[price_recoverable, 'Price'] = df['Order Total'] / df['Quantity']

# Order Total is missing, Price and Quantity are known.
total_recoverable = (
    df[['Price', 'Quantity']].notna().all(axis=1) & df['Order Total'].isna()
)
df.loc[total_recoverable, 'Order Total'] = df['Price'] * df['Quantity']

# Quantity is missing, Order Total and Price are known.
quantity_recoverable = (
    df[['Order Total', 'Price']].notna().all(axis=1) & df['Quantity'].isna()
)
df.loc[quantity_recoverable, 'Quantity'] = df['Order Total'] / df['Price']

In [7]:
# Re-count the missing values per column after the imputation step.
remaining_missing = df.isna().sum()
print(remaining_missing)

Order ID             0
Customer ID          0
Category             0
Item                 0
Price              430
Quantity           430
Order Total        430
Order Date           0
Payment Method    1082
dtype: int64


In [8]:
# Orders with no recorded payment method get the label 'Unknown'.
payment_is_missing = df['Payment Method'].isna()
df.loc[payment_is_missing, 'Payment Method'] = 'Unknown'

In [9]:
# Drop the rows in which Price, Quantity and Order Total are all missing;
# with none of the three values present nothing can be derived for them.

value_columns = ['Price', 'Quantity', 'Order Total']
df = df.dropna(how='all', subset=value_columns)

In [10]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17104 entries, 0 to 17533
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Order ID        17104 non-null  object 
 1   Customer ID     17104 non-null  object 
 2   Category        17104 non-null  object 
 3   Item            17104 non-null  object 
 4   Price           17104 non-null  float64
 5   Quantity        17104 non-null  float64
 6   Order Total     17104 non-null  float64
 7   Order Date      17104 non-null  object 
 8   Payment Method  17104 non-null  object 
dtypes: float64(3), object(6)
memory usage: 1.3+ MB
None


In [11]:
# Convert column dtypes.

# Order Date: parse to datetime; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
df['Order Date'] = pd.to_datetime(df['Order Date'], errors='coerce')
# Quantity: some values were reconstructed as Order Total / Price, so a float
# such as 2.9999999999999996 is possible. astype(int) truncates toward zero,
# which would turn that into 2, hence the values are rounded to the nearest
# whole number before the cast.
df['Quantity'] = df['Quantity'].round().astype(int)

In [12]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17104 entries, 0 to 17533
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Order ID        17104 non-null  object        
 1   Customer ID     17104 non-null  object        
 2   Category        17104 non-null  object        
 3   Item            17104 non-null  object        
 4   Price           17104 non-null  float64       
 5   Quantity        17104 non-null  int64         
 6   Order Total     17104 non-null  float64       
 7   Order Date      17104 non-null  datetime64[ns]
 8   Payment Method  17104 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(5)
memory usage: 1.3+ MB
None


In [13]:
# Show the DataFrame as it stands after the cleaning steps above.
display(df)

Unnamed: 0,Order ID,Customer ID,Category,Item,Price,Quantity,Order Total,Order Date,Payment Method
0,ORD_705844,CUST_092,Side Dishes,Side Salad,3.0,1,3.0,2023-12-21,Credit Card
1,ORD_338528,CUST_021,Side Dishes,Mashed Potatoes,4.0,3,12.0,2023-05-19,Digital Wallet
2,ORD_443849,CUST_029,Main Dishes,Grilled Chicken,15.0,4,60.0,2023-09-27,Credit Card
3,ORD_630508,CUST_075,Drinks,Unknown,2.5,2,5.0,2022-08-09,Credit Card
4,ORD_648269,CUST_031,Main Dishes,Pasta Alfredo,12.0,4,48.0,2022-05-15,Cash
...,...,...,...,...,...,...,...,...,...
17529,ORD_320102,CUST_021,Drinks,Unknown,1.0,4,4.0,2023-12-23,Cash
17530,ORD_974128,CUST_069,Desserts,Ice Cream,5.0,3,15.0,2023-12-18,Cash
17531,ORD_108324,CUST_050,Desserts,Ice Cream,5.0,4,20.0,2022-05-20,Digital Wallet
17532,ORD_612647,CUST_073,Side Dishes,Mashed Potatoes,4.0,2,8.0,2022-01-27,Digital Wallet


In [14]:
# Check for possible logical errors.

# Rows with a non-positive quantity or price (an empty frame means none).
print(df[df['Quantity'] <= 0])
print(df[df['Price'] <= 0])

# Rows whose stored Order Total disagrees with Price * Quantity.
# The values are floats (some of them derived by division), so they are
# compared with a tolerance via np.isclose rather than with an exact `!=`,
# which could flag rows that differ only by floating-point rounding.
# NaN never counts as close, so rows with a missing value are still reported.
expected_total = df['Price'] * df['Quantity']
mask = pd.Series(
    ~np.isclose(df['Order Total'].to_numpy(), expected_total.to_numpy()),
    index=df.index,
)
print(df[mask].head())

Empty DataFrame
Columns: [Order ID, Customer ID, Category, Item, Price, Quantity, Order Total, Order Date, Payment Method]
Index: []
Empty DataFrame
Columns: [Order ID, Customer ID, Category, Item, Price, Quantity, Order Total, Order Date, Payment Method]
Index: []
Empty DataFrame
Columns: [Order ID, Customer ID, Category, Item, Price, Quantity, Order Total, Order Date, Payment Method]
Index: []


In [15]:
# Save the cleaned data set as a .csv file; the index is not written out.

clean_file_path = r"E:\Kursy\pliki\restaurant_clean_data.csv"
df.to_csv(clean_file_path, index=False)