In [None]:
#Import the amazon.csv dataset into a Pandas DataFrame
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Read the raw CSV (path is relative to the notebook's working directory)
# into the DataFrame that every later cell operates on.
df = pd.read_csv(filepath_or_buffer='amazon.csv')


In [None]:
#Display the first few rows and understand the data structure.
print("First 5 rows:")
print(df.head())
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called on its own; wrapping it in print() emitted a stray "None".
df.info()
print(df.describe())
print(df.shape)
print(df.dtypes)

In [None]:
#Check for missing values and handle them appropriately.
print("Missing values per column")
print(df.isnull().sum())

#cleaning the rating_count column
print(df['rating_count'].dtypes)
# Remove thousands separators (e.g. "24,269") before parsing. Without this,
# errors='coerce' turns every comma-formatted count into NaN and the median
# fill below silently replaces real data with an imputed value.
# astype(str) keeps the cell re-runnable once the column is already numeric.
rating_count_text = df['rating_count'].astype(str).str.replace(',', '', regex=False)
df['rating_count'] = pd.to_numeric(rating_count_text, errors='coerce')
median_rating_count = df['rating_count'].median()
# Assign the filled Series back to the column. Calling fillna(inplace=True) on
# df['rating_count'] is chained assignment: it is deprecated and, under
# Copy-on-Write, modifies a temporary object and leaves df unchanged.
df['rating_count'] = df['rating_count'].fillna(median_rating_count)


In [None]:
# Convert price-related columns (discounted_price, actual_price) to numerical format.


print(df['discounted_price'].dtype)#inspect data type
print(df['actual_price'].dtype)


#display sample values
print(df['discounted_price'].head())


def _price_to_number(prices):
    """Parse a Series of formatted price strings into numeric values.

    Every character other than a digit or a decimal point is removed before
    parsing, so currency symbols and thousands separators are handled the
    same way for both price columns. Values that still cannot be parsed
    become NaN.

    Parameters
    ----------
    prices : pandas.Series
        Raw price column. If it is already numeric (the cell is being
        re-run) it is returned unchanged.

    Returns
    -------
    pandas.Series
        Numeric prices.

    Notes
    -----
    A leading minus sign would be stripped as well; this assumes prices are
    never negative -- TODO confirm against the data.
    """
    # dtype.kind is 'i'/'u'/'f' for signed, unsigned and floating numbers.
    if prices.dtype.kind in 'iuf':
        return prices
    digits_only = prices.astype(str).str.replace(r'[^0-9.]', '', regex=True)
    return pd.to_numeric(digits_only, errors='coerce')


#convert discounted_price
# Previously only '&' was removed here, so any value containing a thousands
# separator was coerced to NaN while actual_price kept its value.
df['discounted_price'] = _price_to_number(df['discounted_price'])
print(df['discounted_price'].dtype)

#convert actual_price
df['actual_price'] = _price_to_number(df['actual_price'])
print(df['actual_price'].dtype)




In [None]:
#Convert rating_count to integer values (remove commas if present).
# rating_count has already been parsed with pd.to_numeric and had its missing
# values filled in the missing-values cell, so only the dtype cast remains.
print(df['rating_count'].dtype)
# head must be called: printing the bare attribute shows the bound-method
# repr instead of the first five rows.
print(df['rating_count'].head())

#convert to integer
# astype(int) raises if any NaN is left in the column.
df['rating_count'] = df['rating_count'].astype(int)
print(df['rating_count'].head())


In [None]:
# Derive each product's top-level category: the segment that precedes the
# first '|' in the pipe-delimited category path.

df['main_category'] = (
    df['category']
    .str.split('|')
    .str.get(0)
)
print(df['main_category'].unique())
  

In [None]:
#Create a new column for discount_amount (actual_price -discounted_price)
# Both price columns must already be numeric (see the price conversion cell).
df['discount_amount'] = df['actual_price']-df['discounted_price']
# head() needs its parentheses; without them the bound method is printed
# rather than the first five rows.
print(df[['actual_price','discounted_price','discount_amount']].head())
print(df['discount_amount'].dtype)


In [None]:
#Convert discount_percentage into a numeric value

#display sample values
# head() is called (not just referenced) so the first rows are shown.
print(df['discount_percentage'].head())
# astype(str) lets the cell be re-run after the column is already numeric;
# the .str accessor is only available on string data.
df['discount_percentage'] = df['discount_percentage'].astype(str).str.replace('%', '',regex=False)
# Anything that is still not a number after removing '%' becomes NaN.
df['discount_percentage'] = pd.to_numeric(df['discount_percentage'],errors='coerce')
print(df['discount_percentage'].dtype)
print(df['discount_percentage'].head())


In [None]:
#Extract the year of reviews if a timestamp is present.

if 'review_timestamp' in df.columns:
    print("Timestamp column 'review_timestamp' is present")
    # Perform the extraction this cell is meant to do; previously the branch
    # only printed a message. Unparseable values become NaT, so their year is
    # NaN.
    # NOTE(review): assumes the column holds date strings pd.to_datetime can
    # parse; epoch integers would need an explicit unit= -- TODO confirm.
    df['review_year'] = pd.to_datetime(df['review_timestamp'], errors='coerce').dt.year
    print(df['review_year'].head())
else:
    print("Timestamp column 'review_timestamp' not present")

In [None]:
#Show the distribution of ratings using a histogram.

print(df['rating'].dtype)

# Ratings that are not valid numbers are coerced to NaN. Later cells
# (correlation, heatmap, top-10) rely on this numeric conversion.
df['rating'] = pd.to_numeric(df['rating'],errors='coerce')

plt.figure(figsize=(8,8))
# Drop NaN explicitly so only valid ratings are binned.
plt.hist(df['rating'].dropna(),edgecolor = 'black')
plt.title('Distributions of Ratings')
# Label the axes with what they show instead of the 'X-axis' / 'Y-axis'
# placeholders.
plt.xlabel('Rating')
plt.ylabel('Frequency')

plt.show()

In [None]:
# Pearson correlation coefficient between the discount percentage and the
# product rating (both columns must already be numeric).
correlation = df['discount_percentage'].corr(
    other=df['rating'],
    method='pearson',
)
print(f"Correlation between discount_percentage and rating: {correlation}")

In [None]:
# Rank top-level categories by how many rows (product listings) fall in each.
# NOTE(review): this counts listings, not units sold -- confirm this is the
# intended meaning of "top-selling".
df["main_category"] = (
    df["category"]
    .str.split('|')
    .str.get(0)
)
category_counts = df["main_category"].value_counts()
# value_counts() is sorted descending, so the first five entries are the
# five most frequent categories.
top_5_categories = category_counts.iloc[:5]

print(top_5_categories)



In [None]:
#Analyze the relationship between discount and rating_count.

# Bins are left-closed ([0, 10), [10, 20), ...) because right=False.
# The last edge is open-ended so the '50%+' bucket covers everything from 50%
# upwards; with a hard upper edge of 100 a 100% discount fell outside
# [50, 100) and was assigned NaN.
bins = [0, 10, 20, 30, 40, 50, float('inf')]
labels = ['0-10%', '10-20%', '20-30%', '30-40%', '40-50%', '50%+']
df['discount_range'] = pd.cut(df['discount_percentage'], bins=bins, labels=labels, right=False)

# Create box plot
plt.figure(figsize=(12, 8))
sns.boxplot(x='discount_range', y='rating_count', data=df)
plt.title('Rating Count by Discount Range')
plt.xlabel('Discount Range')
plt.ylabel('Rating Count')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Bar chart of the 10 highest-rated products.

# Order all rows by rating, best first, and keep the leading ten.
top_10_products = (
    df
    .sort_values(by='rating', ascending=False)
    .head(n=10)
)
def shorten_name(product_name, max_words=3):
    """Abbreviate a product name to its first few words for use as a label.

    Parameters
    ----------
    product_name : str
        Full product name. Non-string values (for example NaN or None for a
        missing name) are returned unchanged instead of raising.
    max_words : int, default 3
        Maximum number of whitespace-separated words to keep.

    Returns
    -------
    str
        The first ``max_words`` words joined by single spaces and followed by
        '...' when the name is longer than ``max_words`` words; otherwise the
        original name, untouched.
    """
    # Missing names reach this function as float NaN / None, which have no
    # .split(); pass them through so .apply() does not crash.
    if not isinstance(product_name, str):
        return product_name
    words = product_name.split()
    if len(words) > max_words:
        return ' '.join(words[:max_words]) + '...'
    return product_name
# Short labels keep the x tick text readable.
# NOTE(review): two products sharing the same first words get the same label
# and would be drawn at the same x position -- confirm labels are unique.
top_10_products['short_name'] = top_10_products['product_name'].apply(shorten_name)

plt.figure(figsize=(8,6))
plt.bar(top_10_products['short_name'],top_10_products['rating'],color='lightblue')
plt.title("Top 10 Rated Products")
plt.xlabel('Product Name')
# This was a second plt.xlabel() call, which overwrote 'Product Name' and left
# the y axis without a label; the rating is what the bar height shows.
plt.ylabel('rating')
plt.xticks(rotation=90)

plt.show()

In [None]:
# Scatter plot: one point per row, discount percentage on x, rating on y.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='discount_percentage', y='rating')
# Title and axis labels are set on the current axes in a single call.
plt.gca().set(
    title='Discount Percentage vs. Rating',
    xlabel='Discount Percentage (%)',
    ylabel='Rating',
)
plt.grid(True)
plt.show()

In [None]:
# Pie chart of the five top-level categories with the largest summed
# rating_count (the code uses rating_count as the measure of "selling").
top_categories = (
    df
    .groupby(df['category'].str.split('|').str.get(0))['rating_count']
    .sum()
    .nlargest(5)
)

# Draw the pie; each wedge is labelled with its category and percentage share.
plt.figure(figsize=(8, 8))
plt.pie(
    x=top_categories,
    labels=top_categories.index,
    startangle=140,
    autopct='%1.1f%%',
)
plt.gca().set_title('Top 5 Selling Categories')
plt.axis('equal')
plt.show()

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
numerical_cols = [
    'rating',
    'rating_count',
    'discount_percentage',
    'discount_amount',
    'actual_price',
    'discounted_price',
]
correlation_matrix = df.loc[:, numerical_cols].corr()

# Annotate every cell with its coefficient, rounded to two decimals.
plt.figure(figsize=(10, 8))
sns.heatmap(data=correlation_matrix, cmap='magma', annot=True, fmt=".2f")
plt.gca().set_title('Correlation Heatmap of Numerical Columns')
plt.show()


In [None]:
#Based on the analysis, suggest which product categories should be prioritized for discounts.
# The sentence was missing "of" ("driver ratings").
Answer = "The heatmap suggests that discounts are not a primary driver of ratings."

In [None]:
# Recommend strategies to improve sales and customer engagement.
# Corrected "products descriptions" to "product descriptions".
Answer1 = "Ensure product descriptions are accurate and highlight key features and benefits."
Answer2 = "Provide prompt and helpful customer support."


In [None]:
#Discuss any anomalies or patterns found in the data
# The two string literals are joined inside parentheses so Answer is one
# complete sentence. Previously a trailing comma made Answer a 1-tuple holding
# only the first half, and the second literal was a separate, discarded
# expression.
Answer = (
    "As observed in the heatmap the weak correlation between discount_percentage and ratings "
    "suggests customers might be perceiving products with higher discounts as being of lower quality or value."
)