In [None]:
import pandas as pd

# Load both CSV files (paths are relative to the current working directory).
customers_df = pd.read_csv('customers.csv')
sales_df = pd.read_csv('sales.csv')

# Preview the first rows of each table (DataFrame.head() defaults to 5 rows).
print("First few rows of Customers Dataset:", customers_df.head(), sep="\n")
print("\nFirst few rows of Sales Dataset:", sales_df.head(), sep="\n")

# Report the dimensions of each table as a (rows, columns) tuple.
print(f"\nShape of Customers Dataset (rows, columns): {customers_df.shape}")
print(f"Shape of Sales Dataset (rows, columns): {sales_df.shape}")

# Count missing values per column; isna() is the alias of isnull().
print("\nMissing values in Customers Dataset:", customers_df.isna().sum(), sep="\n")
print("\nMissing values in Sales Dataset:", sales_df.isna().sum(), sep="\n")


In [None]:
# Convert the customers table to a list with one dict per row
# (note: despite its name, customers_dict is a list of dicts).
customers_dict = customers_df.to_dict('records')

city_to_filter = "New York"

# Approach 1: plain-Python filter over the list of row dicts.
filtered_dict = [row for row in customers_dict if row['City'] == city_to_filter]
print(f"Number of customers from {city_to_filter} using dictionary: {len(filtered_dict)}")

# Approach 2: vectorised boolean mask applied to the DataFrame.
city_mask = customers_df['City'] == city_to_filter
filtered_df = customers_df.loc[city_mask]
print(f"\nNumber of customers from {city_to_filter} using DataFrame: {len(filtered_df)}")

print("\nFiltered Data (Using DataFrame):", filtered_df, sep="\n")


In [None]:
import time

# Compare how long one pass of each filtering approach takes.
#
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# highest-available-resolution clock intended for measuring short durations.
# time.time() follows the wall clock, which can have coarse resolution and may
# be adjusted while the code runs, so both intervals could read as 0.0 and the
# comparison below would then always report the DataFrame as faster.
# Each approach is still timed with a single run, so results remain noisy.
city_to_filter = "New York"

# Time the list-of-dicts filter.
start_dict = time.perf_counter()
filtered_dict = [customer for customer in customers_dict if customer['City'] == city_to_filter]
end_dict = time.perf_counter()

time_dict = end_dict - start_dict
print(f"Time taken for filtering using dictionary: {time_dict:.6f} seconds")

# Time the DataFrame boolean-mask filter.
start_df = time.perf_counter()
filtered_df = customers_df[customers_df['City'] == city_to_filter]
end_df = time.perf_counter()

time_df = end_df - start_df
print(f"Time taken for filtering using DataFrame: {time_df:.6f} seconds")

# Report whichever approach had the smaller elapsed time
# (a tie falls through to the DataFrame message, as before).
if time_dict < time_df:
    print("\nDictionary filtering is faster.")
else:
    print("\nDataFrame filtering is faster.")


In [None]:
# Build de-duplicated versions of both tables; the raw frames are left untouched.
# drop_duplicates() with no arguments removes rows that repeat across all columns,
# keeping the first occurrence.
customers_df_cleaned = customers_df.drop_duplicates()
sales_df_cleaned = sales_df.drop_duplicates()

# Duplicate counts before and after cleaning, customers first, then sales.
print(f"Number of duplicate rows in Customers Dataset: {customers_df.duplicated().sum()}")
print(f"\nNumber of duplicate rows in Customers Dataset after cleaning: {customers_df_cleaned.duplicated().sum()}")

print(f"\nNumber of duplicate rows in Sales Dataset: {sales_df.duplicated().sum()}")
print(f"\nNumber of duplicate rows in Sales Dataset after cleaning: {sales_df_cleaned.duplicated().sum()}")

# Dimensions of the cleaned tables as (rows, columns).
print(f"\nShape of Customers Dataset after cleaning: {customers_df_cleaned.shape}")
print(f"Shape of Sales Dataset after cleaning: {sales_df_cleaned.shape}")


In [None]:
# Multiplier applied to every sale Amount (0.9 keeps 90% of the original value).
DISCOUNT_FACTOR = 0.9

# Add the discounted column with .assign(), which returns a new DataFrame rather
# than writing into the existing object. sales_df_cleaned is produced by
# drop_duplicates(), and assigning a column in place on such a result can raise
# SettingWithCopyWarning in pandas versions without copy-on-write. Rebinding the
# same name keeps Discounted_Amount available to any later cell that reads
# sales_df_cleaned.
sales_df_cleaned = sales_df_cleaned.assign(
    Discounted_Amount=sales_df_cleaned['Amount'] * DISCOUNT_FACTOR
)

# Sum the discounted amounts per product.
product_sales = sales_df_cleaned.groupby('Product')['Discounted_Amount'].sum().reset_index()

# Rename the aggregated column to describe what it now holds.
product_sales.columns = ['Product', 'Total_Sales']

print("\nSales grouped by Product with total discounted sales:")
print(product_sales)

In [None]:
# Keep customers whose Age lies in [25, 35]; Series.between is inclusive on both ends.
age_in_range = customers_df_cleaned['Age'].between(25, 35)
filtered_customers = customers_df_cleaned.loc[age_in_range]

# Count the remaining customers per city and shape the result as a two-column table.
city_customer_counts = (
    filtered_customers['City']
    .value_counts()
    .rename_axis('City')
    .reset_index(name='Customer_Count')
)

print("\nFiltered customers aged between 25 and 35:", filtered_customers, sep="\n")

print(
    "\nNumber of customers in each city within the age range of 25 to 35:",
    city_customer_counts,
    sep="\n",
)


In [None]:
# Join customers to their sales on CustomerID (inner join, the pandas default).
merged_df = customers_df_cleaned.merge(sales_df_cleaned, on='CustomerID')

# Total Amount per city, then pick the row with the largest total.
city_sales = (
    merged_df
    .groupby('City')['Amount']
    .sum()
    .reset_index()
)
top_city_label = city_sales['Amount'].idxmax()
city_with_highest_sales = city_sales.loc[top_city_label]

# Number of non-null SaleID values per product, stored under 'Units_Sold',
# then pick the row with the largest count.
product_sales_count = (
    merged_df
    .groupby('Product')['SaleID']
    .count()
    .reset_index(name='Units_Sold')
)
top_product_label = product_sales_count['Units_Sold'].idxmax()
product_with_most_units_sold = product_sales_count.loc[top_product_label]

print("City with the highest total sales:", city_with_highest_sales, sep="\n")

print("\nProduct with the most units sold:", product_with_most_units_sold, sep="\n")

In [None]:
# Distinct City and Product values present in the merged table, in order of first appearance.
unique_cities = merged_df['City'].unique()
unique_products = merged_df['Product'].unique()

# Central-tendency statistics of the sale amounts.
amount_col = merged_df['Amount']
mean_amount = amount_col.mean()
median_amount = amount_col.median()

print("Unique values in the City column:", unique_cities, sep="\n")

print("\nUnique values in the Product column:", unique_products, sep="\n")

print("\nMean of the Amount column:", mean_amount)
print("Median of the Amount column:", median_amount)
