In [2]:
import pandas as pd
import numpy as np

In [7]:
sales_data = {
    'Sales_ID': [1001, 1002, 1003, 1004, 1005],
    'Product_Code': ['PROD_A', 'PROD_B', 'PROD_A', 'PROD_C', 'PROD_B'],
    # Varied date formats
    'Revenue': ['150.75', '$200.00', '99.50', '125.00', '75.25'], 
    'Transaction_Date': ['2025-01-01', '01/02/2025', '2025-Jan-03','2025-01-04', '05-01-2025'], 
    'Customer_Rating': [4.5, 3.0, 5.0, 2.5, 4.0],
    # Limited unique values
    'Payment_Method': ['Credit Card', 'Cash', 'Credit Card', 'Debit Card', 'Cash'] 
}
df_sales = pd.DataFrame(sales_data)

In [8]:
print("Original Sales Data:")
print(df_sales)
print("\nOriginal Data Types (using .info()):")
df_sales.info()


Original Sales Data:
   Sales_ID Product_Code  Revenue Transaction_Date  Customer_Rating  \
0      1001       PROD_A   150.75       2025-01-01              4.5   
1      1002       PROD_B  $200.00       01/02/2025              3.0   
2      1003       PROD_A    99.50      2025-Jan-03              5.0   
3      1004       PROD_C   125.00       2025-01-04              2.5   
4      1005       PROD_B    75.25       05-01-2025              4.0   

  Payment_Method  
0    Credit Card  
1           Cash  
2    Credit Card  
3     Debit Card  
4           Cash  

Original Data Types (using .info()):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Sales_ID          5 non-null      int64  
 1   Product_Code      5 non-null      object 
 2   Revenue           5 non-null      object 
 3   Transaction_Date  5 non-null      object 
 4   Customer_Rati

In [9]:
print("Data types using .dtypes:")
print(df_sales.dtypes)

Data types using .dtypes:
Sales_ID              int64
Product_Code         object
Revenue              object
Transaction_Date     object
Customer_Rating     float64
Payment_Method       object
dtype: object


In [None]:
df_converted_basic = df_sales.copy() # Work on a copy
df_converted_basic['Sales_ID'] = df_converted_basic['Sales_ID'].astype(str)

In [11]:
print("\nDataFrame after converting 'Sales_ID' to string:")
print(df_converted_basic)
print("\nData Types after 'Sales_ID' conversion:")
df_converted_basic.info()


DataFrame after converting 'Sales_ID' to string:
  Sales_ID Product_Code  Revenue Transaction_Date  Customer_Rating  \
0     1001       PROD_A   150.75       2025-01-01              4.5   
1     1002       PROD_B  $200.00       01/02/2025              3.0   
2     1003       PROD_A    99.50      2025-Jan-03              5.0   
3     1004       PROD_C   125.00       2025-01-04              2.5   
4     1005       PROD_B    75.25       05-01-2025              4.0   

  Payment_Method  
0    Credit Card  
1           Cash  
2    Credit Card  
3     Debit Card  
4           Cash  

Data Types after 'Sales_ID' conversion:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Sales_ID          5 non-null      object 
 1   Product_Code      5 non-null      object 
 2   Revenue           5 non-null      object 
 3   Transaction_Date  5 non-null      

In [None]:
df_cleaned_revenue = df_sales.copy() #
df_cleaned_revenue['Revenue'] = df_cleaned_revenue['Revenue'].str.replace('$', '', regex=False)

print("\nRevenue column after removing '$':")
print(df_cleaned_revenue['Revenue'])


Revenue column after removing '$':
0    150.75
1    200.00
2     99.50
3    125.00
4     75.25
Name: Revenue, dtype: object


In [13]:
df_cleaned_revenue['Revenue'] = pd.to_numeric(df_cleaned_revenue['Revenue'])

In [14]:
print("\nDataFrame after converting 'Revenue' to numeric:")
print(df_cleaned_revenue)
print("\nData Types after 'Revenue' conversion:")
df_cleaned_revenue.info()


DataFrame after converting 'Revenue' to numeric:
   Sales_ID Product_Code  Revenue Transaction_Date  Customer_Rating  \
0      1001       PROD_A   150.75       2025-01-01              4.5   
1      1002       PROD_B   200.00       01/02/2025              3.0   
2      1003       PROD_A    99.50      2025-Jan-03              5.0   
3      1004       PROD_C   125.00       2025-01-04              2.5   
4      1005       PROD_B    75.25       05-01-2025              4.0   

  Payment_Method  
0    Credit Card  
1           Cash  
2    Credit Card  
3     Debit Card  
4           Cash  

Data Types after 'Revenue' conversion:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Sales_ID          5 non-null      int64  
 1   Product_Code      5 non-null      object 
 2   Revenue           5 non-null      float64
 3   Transaction_Date  5 non-null 

In [17]:
df_converted_datetime = df_sales.copy()
df_converted_datetime['Transaction_Date'] = \
pd.to_datetime(df_converted_datetime['Transaction_Date'],
               format='mixed', dayfirst=True)


In [19]:
print("\nDataFrame after converting 'Transaction_Date' to datetime:")
print(df_converted_datetime)
print("\nData Types after 'Transaction_Date' conversion:")
df_converted_datetime.info()


DataFrame after converting 'Transaction_Date' to datetime:
   Sales_ID Product_Code  Revenue Transaction_Date  Customer_Rating  \
0      1001       PROD_A   150.75       2025-01-01              4.5   
1      1002       PROD_B  $200.00       2025-02-01              3.0   
2      1003       PROD_A    99.50       2025-01-03              5.0   
3      1004       PROD_C   125.00       2025-01-04              2.5   
4      1005       PROD_B    75.25       2025-01-05              4.0   

  Payment_Method  
0    Credit Card  
1           Cash  
2    Credit Card  
3     Debit Card  
4           Cash  

Data Types after 'Transaction_Date' conversion:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Sales_ID          5 non-null      int64         
 1   Product_Code      5 non-null      object        
 2   Revenue           5 non-null 

In [20]:
df_optimized_category = df_sales.copy()
df_optimized_category['Payment_Method'] = \
df_optimized_category['Payment_Method'].astype('category')

In [21]:
print("\nDataFrame after converting 'Payment_Method' to categorical:")
print(df_optimized_category)
print("\nData Types after 'Payment_Method' conversion:")
df_optimized_category.info()


DataFrame after converting 'Payment_Method' to categorical:
   Sales_ID Product_Code  Revenue Transaction_Date  Customer_Rating  \
0      1001       PROD_A   150.75       2025-01-01              4.5   
1      1002       PROD_B  $200.00       01/02/2025              3.0   
2      1003       PROD_A    99.50      2025-Jan-03              5.0   
3      1004       PROD_C   125.00       2025-01-04              2.5   
4      1005       PROD_B    75.25       05-01-2025              4.0   

  Payment_Method  
0    Credit Card  
1           Cash  
2    Credit Card  
3     Debit Card  
4           Cash  

Data Types after 'Payment_Method' conversion:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   Sales_ID          5 non-null      int64   
 1   Product_Code      5 non-null      object  
 2   Revenue           5 non-null      object  
 3   Transa