In [4]:
import pandas as pd

# Load both raw datasets from CSV files in the working directory.
sales_df = pd.read_csv('sales.csv')
customers_df = pd.read_csv('customers.csv')

# Preview the first rows of each dataset, separated by blank lines.
print(sales_df.head())
print("\n")
print(customers_df.head())

# Report the (rows, columns) shape of each dataset.
for heading, frame in (
    ("\nSales Dataset - Number of Rows and Columns:", sales_df),
    ("\nCustomers Dataset - Number of Rows and Columns:", customers_df),
):
    print(heading)
    print(frame.shape)

# Report how many missing values each column contains.
for heading, frame in (
    ("\nSales Dataset - Missing Values:", sales_df),
    ("\nCustomers Dataset - Missing Values:", customers_df),
):
    print(heading)
    print(frame.isna().sum())

# Strategy 1: discard every row that has at least one missing value.
sales_df_cleaned, customers_df_cleaned = sales_df.dropna(), customers_df.dropna()

# Strategy 2: keep all rows and substitute a placeholder for missing values.
# These two frames are built here but not displayed by this cell.
# NOTE(review): the placeholder is applied to every column of the frame, so a
# missing numeric customer field would receive the string 'Unknown' — confirm
# this is the intended fill policy.
sales_df_filled = sales_df.fillna(0)
customers_df_filled = customers_df.fillna('Unknown')

# Show the result of strategy 1 for each dataset.
for heading, frame in (
    ("\nSales Dataset - Cleaned (Drop missing rows):", sales_df_cleaned),
    ("\nCustomers Dataset - Cleaned (Drop missing rows):", customers_df_cleaned),
):
    print(heading)
    print(frame.head())


   SaleID  CustomerID     Product  Amount
0     101           1      Laptop     200
1     102           2  Smartphone     500
2     103           3      Tablet     800
3     104           4  Headphones    1100
4     105           5     Monitor    1400


   CustomerID  Age         City
0           1   22     New York
1           2   23  Los Angeles
2           3   24      Chicago
3           4   25      Houston
4           5   26      Phoenix

Sales Dataset - Number of Rows and Columns:
(400, 4)

Customers Dataset - Number of Rows and Columns:
(100, 3)

Sales Dataset - Missing Values:
SaleID        0
CustomerID    0
Product       0
Amount        0
dtype: int64

Customers Dataset - Missing Values:
CustomerID    0
Age           0
City          0
dtype: int64

Sales Dataset - Cleaned (Drop missing rows):
   SaleID  CustomerID     Product  Amount
0     101           1      Laptop     200
1     102           2  Smartphone     500
2     103           3      Tablet     800
3     104           

In [5]:
import pandas as pd
import time

# Compare two ways of selecting the customers of one city:
#   1. a list comprehension over a list of per-row dicts, and
#   2. a boolean-mask selection on the DataFrame.
customers_df = pd.read_csv('customers.csv')
# Despite its name, this is a list with one dict per row (orient='records').
customers_dict = customers_df.to_dict(orient='records')

city_to_filter = 'New York'

# time.perf_counter() is used instead of time.time(): it is a monotonic clock
# with the highest available resolution, intended for measuring short
# intervals. time.time() is a wall clock whose resolution can be too coarse
# to time a sub-millisecond operation.
# Each figure is still a single measurement; use %timeit for a real benchmark.
start_time_dict = time.perf_counter()
filtered_dict = [customer for customer in customers_dict if customer['City'] == city_to_filter]
end_time_dict = time.perf_counter()
dict_time = end_time_dict - start_time_dict

start_time_df = time.perf_counter()
filtered_df = customers_df[customers_df['City'] == city_to_filter]
end_time_df = time.perf_counter()
df_time = end_time_df - start_time_df

# Both approaches must select the same number of customers.
print(f"Filtered customers (using dictionary): {len(filtered_dict)} customers")
print(f"Filtered customers (using DataFrame): {len(filtered_df)} customers")

print(f"\nTime taken using dictionary: {dict_time:.6f} seconds")
print(f"Time taken using DataFrame: {df_time:.6f} seconds")


Filtered customers (using dictionary): 20 customers
Filtered customers (using DataFrame): 20 customers

Time taken using dictionary: 0.001000 seconds
Time taken using DataFrame: 0.008000 seconds


In [6]:
import pandas as pd

# --- Sales: find, remove, then re-check fully duplicated rows ---
sales_df = pd.read_csv('sales.csv')
sales_duplicates = sales_df.loc[sales_df.duplicated()]
sales_df_cleaned = sales_df.drop_duplicates()
sales_duplicates_removed = sales_df_cleaned.loc[sales_df_cleaned.duplicated()]

# --- Customers: same three steps ---
customers_df = pd.read_csv('customers.csv')
customers_duplicates = customers_df.loc[customers_df.duplicated()]
customers_df_cleaned = customers_df.drop_duplicates()
customers_duplicates_removed = customers_df_cleaned.loc[customers_df_cleaned.duplicated()]

# Duplicate counts before cleaning, then after cleaning.
print(f"Duplicate rows in sales dataset before cleaning: {len(sales_duplicates)}")
print(f"Duplicate rows in customers dataset before cleaning: {len(customers_duplicates)}")
print(f"Duplicate rows in sales dataset after cleaning: {len(sales_duplicates_removed)}")
print(f"Duplicate rows in customers dataset after cleaning: {len(customers_duplicates_removed)}")


Duplicate rows in sales dataset before cleaning: 0
Duplicate rows in customers dataset before cleaning: 0
Duplicate rows in sales dataset after cleaning: 0
Duplicate rows in customers dataset after cleaning: 0


In [7]:
import pandas as pd

# Load the sales data and add a column holding each sale's Amount scaled by
# 0.9 (i.e. the amount after a 10% reduction).
sales_df = pd.read_csv('sales.csv').assign(
    Amount_after_discount=lambda frame: frame['Amount'] * 0.9
)

# Total the discounted amounts per product; reset_index() turns the Product
# group labels back into an ordinary column.
discounted_by_product = sales_df.groupby('Product')['Amount_after_discount']
product_sales = discounted_by_product.sum().reset_index()
print(product_sales)


      Product  Amount_after_discount
0  Headphones                79200.0
1      Laptop                14400.0
2     Monitor               100800.0
3  Smartphone                36000.0
4      Tablet                57600.0


In [8]:
# Relies on `pd` already being imported by an earlier cell.
customers_df = pd.read_csv('customers.csv')

# Select customers whose Age lies between 25 and 35, both ends included.
age_in_range = customers_df['Age'].between(25, 35)
filtered_customers = customers_df[age_in_range]

# Number of selected customers per city.
city_count = filtered_customers['City'].value_counts()

print(filtered_customers)
print(city_count)


    CustomerID  Age         City
3            4   25      Houston
4            5   26      Phoenix
5            6   27     New York
6            7   28  Los Angeles
7            8   29      Chicago
8            9   30      Houston
9           10   31      Phoenix
10          11   32     New York
11          12   33  Los Angeles
12          13   34      Chicago
13          14   35      Houston
33          34   25      Houston
34          35   26      Phoenix
35          36   27     New York
36          37   28  Los Angeles
37          38   29      Chicago
38          39   30      Houston
39          40   31      Phoenix
40          41   32     New York
41          42   33  Los Angeles
42          43   34      Chicago
43          44   35      Houston
63          64   25      Houston
64          65   26      Phoenix
65          66   27     New York
66          67   28  Los Angeles
67          68   29      Chicago
68          69   30      Houston
69          70   31      Phoenix
70        

In [9]:
# Relies on `pd` already being imported by an earlier cell.
customers_df = pd.read_csv('customers.csv')
sales_df = pd.read_csv('sales.csv')

# Attach customer attributes (including City) to every sale via CustomerID.
# validate='many_to_one' makes pandas raise MergeError if a CustomerID occurs
# more than once in customers_df. Without the check, a repeated ID would
# silently duplicate the matching sale rows and inflate the sums below.
merged_df = pd.merge(sales_df, customers_df, on='CustomerID', validate='many_to_one')

# Despite their names, these hold the *label* (city / product) whose summed
# Amount is largest, not the summed amount itself.
city_sales = merged_df.groupby('City')['Amount'].sum().idxmax()
product_sales = merged_df.groupby('Product')['Amount'].sum().idxmax()

print(city_sales)
print(product_sales)


Phoenix
Monitor


In [10]:
# Uses `merged_df` (sales joined with customers) built by an earlier cell.

# Central tendency of the sale amounts.
amount_column = merged_df['Amount']
mean_amount = amount_column.mean()
median_amount = amount_column.median()

# Distinct category values, in order of first appearance.
unique_cities = merged_df['City'].unique()
unique_products = merged_df['Product'].unique()

for value in (unique_cities, unique_products, mean_amount, median_amount):
    print(value)


['New York' 'Los Angeles' 'Chicago' 'Houston' 'Phoenix']
['Laptop' 'Smartphone' 'Tablet' 'Headphones' 'Monitor']
800.0
800.0
