## Amazon Sales Data ##

In [None]:
#Q1) What is the average rating for each product category?

#Code) 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
df = pd.read_csv("amazon.csv")

# Split the category by '|' and take the top-level category
df['top_category'] = df['category'].str.split('|').str[0]

# Convert the rating column to numeric (it may contain non-numeric values)
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

# Calculate the average rating for each top-level category
average_ratings = df.groupby('top_category')['rating'].mean().reset_index()

# Sort the result by average rating in descending order
average_ratings_sorted = average_ratings.sort_values(by='rating', ascending=False)

# Display the result
print(average_ratings_sorted)

In [None]:
#Q2) What are the top rating_count products by category?

#Code)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

import pandas as pd

# Load the dataset
df = pd.read_csv("amazon.csv")

# Convert 'rating_count' to numeric, as it might contain commas or other non-numeric characters
df['rating_count'] = pd.to_numeric(df['rating_count'].str.replace(',', ''), errors='coerce')

# Extract the top-level category
df['top_category'] = df['category'].str.split('|').str[0]

# Find the product with the highest rating count for each top-level category
top_rating_count_products = df.loc[df.groupby('top_category')['rating_count'].idxmax(), 
                                   ['top_category', 'product_name', 'rating_count', 'rating']]

# Sort by top-level category for better readability
top_rating_count_products_sorted = top_rating_count_products.sort_values(by='top_category').reset_index(drop=True)

# Display the result
print(top_rating_count_products_sorted)

In [None]:
#Q3) What is the distribution of discounted prices vs. actual prices?

#Code)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
df = pd.read_csv("amazon.csv")

# Remove currency symbols and convert 'discounted_price' and 'actual_price' to numeric
df['discounted_price'] = pd.to_numeric(df['discounted_price'].str.replace('₹', '').str.replace(',', ''), errors='coerce')
df['actual_price'] = pd.to_numeric(df['actual_price'].str.replace('₹', '').str.replace(',', ''), errors='coerce')

# Drop any rows with NaN values in either of the price columns
df_clean = df.dropna(subset=['discounted_price', 'actual_price'])

# Plot the distribution of discounted prices and actual prices
plt.figure(figsize=(10, 6))

sns.kdeplot(df_clean['discounted_price'], label='Discounted Price', color='blue', fill=True)
sns.kdeplot(df_clean['actual_price'], label='Actual Price', color='red', fill=True)

# Add labels and title
plt.xlabel('Price (₹)')
plt.ylabel('Density')
plt.title('Distribution of Discounted Prices vs. Actual Prices')
plt.legend()

# Show the plot
plt.show()

In [None]:
#Q4) How does the average discount percentage vary across categories?

#Code)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
df = pd.read_csv("amazon.csv")

# Remove currency symbols and convert 'discounted_price' and 'actual_price' to numeric
df['discounted_price'] = pd.to_numeric(df['discounted_price'].str.replace('₹', '').str.replace(',', ''), errors='coerce')
df['actual_price'] = pd.to_numeric(df['actual_price'].str.replace('₹', '').str.replace(',', ''), errors='coerce')

# Calculate the discount percentage
df['discount_percentage'] = ((df['actual_price'] - df['discounted_price']) / df['actual_price']) * 100

# Extract the top-level category
df['top_category'] = df['category'].str.split('|').str[0]

# Calculate the average discount percentage for each top-level category
average_discount_by_category = df.groupby('top_category')['discount_percentage'].mean().reset_index()

# Sort by the highest average discount percentage
average_discount_by_category_sorted = average_discount_by_category.sort_values(by='discount_percentage', ascending=False)

# Display the result
print(average_discount_by_category_sorted)

In [None]:
#Q5) What are the most popular product names?

#Code)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
df = pd.read_csv("amazon.csv")

# Convert 'rating_count' to numeric, as it might contain commas or other non-numeric characters
df['rating_count'] = pd.to_numeric(df['rating_count'].str.replace(',', ''), errors='coerce')

# Sort products by rating count in descending order
most_popular_products = df.sort_values(by='rating_count', ascending=False)

# Select the top N most popular products, for example, top 10
top_n_most_popular = most_popular_products[['product_name', 'rating_count']].head(10)

# Display the result
print(top_n_most_popular)

In [None]:
#Q6) What are the most popular product keywords?

#Code)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from collections import Counter
import re

# Load the dataset
df = pd.read_csv("amazon.csv")

# Convert 'rating_count' to numeric, as it might contain commas or other non-numeric characters
df['rating_count'] = pd.to_numeric(df['rating_count'].str.replace(',', ''), errors='coerce')

# Create a list to store all product keywords weighted by their rating_count
keywords = []

# Iterate through each product and tokenize the product name
for _, row in df.iterrows():
    if pd.notna(row['product_name']) and pd.notna(row['rating_count']):
        # Tokenize the product name by splitting on non-word characters (e.g., spaces, punctuation)
        words = re.findall(r'\w+', row['product_name'].lower())
        # Add each word to the keywords list multiplied by the product's rating_count
        keywords.extend(words * int(row['rating_count']))

# Count the frequency of each keyword
keyword_counts = Counter(keywords)

# Get the top 50 most popular keywords
top_n_keywords = keyword_counts.most_common(50)

# Display the result
print(top_n_keywords)

In [None]:
#Q7) What are the most popular product reviews?

#Code)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
df = pd.read_csv("amazon.csv")

# Convert 'rating_count' to numeric, as it might contain commas or other non-numeric characters
df['rating_count'] = pd.to_numeric(df['rating_count'].str.replace(',', ''), errors='coerce')

# Sort reviews by rating count in descending order
most_popular_reviews = df.sort_values(by='rating_count', ascending=False)

# Select the top N most popular reviews, for example, top 10
top_n_reviews = most_popular_reviews[['review', 'rating_count']].head(10)

# Display the result
print(top_n_reviews)

In [None]:
#Q8) What is the correlation between discounted_price and rating?

#Code)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
df = pd.read_csv("amazon.csv")

# Remove currency symbols and convert 'discounted_price' to numeric
df['discounted_price'] = pd.to_numeric(df['discounted_price'].str.replace('₹', '').str.replace(',', ''), errors='coerce')

# Convert 'rating' to numeric
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

# Drop any rows with NaN values in either 'discounted_price' or 'rating'
df_clean = df.dropna(subset=['discounted_price', 'rating'])

# Calculate the Pearson correlation coefficient between discounted_price and rating
correlation = df_clean['discounted_price'].corr(df_clean['rating'])

# Display the correlation
print(f"The correlation between discounted price and rating is: {correlation}")

In [None]:
#Q9) What are the Top 5 categories based on the highest ratings?

#Code)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
df = pd.read_csv("amazon.csv")

# Convert 'rating' to numeric, handling any potential non-numeric values
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

# Extract the top-level category
df['top_category'] = df['category'].str.split('|').str[0]

# Calculate the average rating for each top-level category
average_ratings_by_category = df.groupby('top_category')['rating'].mean().reset_index()

# Sort by the highest average rating
top_categories = average_ratings_by_category.sort_values(by='rating', ascending=False).head(5)

# Display the top 5 categories
print(top_categories)

In [None]:
#Q10) Identify any potential areas for improvement or optimization based on the data analysis.

#Code)
# Analyzing potential areas for improvement or optimization based on the dataset

# 1. Analyzing the relationship between discounts and ratings
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Plot the relationship between discount percentage and ratings
plt.figure(figsize=(10, 6))
sns.scatterplot(x='discount_percentage', y='rating', data=data)
plt.title('Relationship between Discount Percentage and Product Rating')
plt.xlabel('Discount Percentage (%)')
plt.ylabel('Rating')
plt.grid(True)
plt.show()

# 2. Checking correlation between discount percentage, rating, and prices to identify patterns
correlation_matrix = data[['discount_percentage', 'rating', 'discounted_price', 'actual_price']].corr()

# Display the correlation matrix
correlation_matrix

## Spotify Data Set ##

In [None]:
#Q1) Load the dataframe and ensure data quality by checking for missing values and duplicate rows. Handle missing values and remove duplicate rows if necessary.

#Code)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
df = pd.read_csv("spotify.csv")

# Display the result
print(df.shape)
print(df.isnull().sum())
print(df.duplicated().sum())

In [None]:
#Q2) What is the distribution of popularity among the tracks in the dataset? Visualize it using a histogram.

#Code)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
df = pd.read_csv("spotify.csv")

plt.hist(df["Popularity"], bins=range(0, 101, 5))
plt.xlabel("Popularity")
plt.ylabel("Frequency")
plt.title("Distribution of Popularity Among Tracks")
plt.show()

In [None]:
#Q3) Is there any relationship between the popularity and the duration of tracks? Explore this using a scatter plot.

#Code)
import pandas as pd
import matplotlib.pyplot as plt

# Create a DataFrame from the provided context (if not already available)
data = [
    ["Drake", "Rich Baby Daddy (feat. Sexyy Red & SZA)", 92, 319191, "1yeB8MUNeLo9Ek1UEpsyz6"],
    # ... Add the rest of the dataset here
]

columns = ["Artist", "Track Name", "Popularity", "Duration (ms)", "Track ID"]
df = pd.read_csv("spotify.csv")

# Create the scatter plot
plt.figure(figsize=(10, 6))
plt.scatter(df["Duration (ms)"], df["Popularity"])
plt.xlabel("Duration (ms)")
plt.ylabel("Popularity")
plt.title("Popularity vs Duration of Tracks")
plt.grid(True)
plt.show()

In [None]:
#Q4) Which artist has the highest number of tracks in the dataset? Display the count of tracks for each artist using a countplot.

#Code)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Create a DataFrame from the provided context (if not already available)
data = [
    ["Drake", "Rich Baby Daddy (feat. Sexyy Red & SZA)", 92, 319191, "1yeB8MUNeLo9Ek1UEpsyz6"],
    # ... Add the rest of the dataset here
]

columns = ["Artist", "Track Name", "Popularity", "Duration (ms)", "Track ID"]
# Load the dataset
df = pd.read_csv("spotify.csv")

# Create the count plot
sns.countplot(x="Artist", data=df)
plt.title("Number of Tracks per Artist")
plt.xlabel("Artist")
plt.ylabel("Count")
plt.show()

In [None]:
#Q5) What are the top 5 least popular tracks in the dataset? Provide the artist name and track name for each.

#Code)
import pandas as pd

# Create a DataFrame from the provided context (if not already available)
data = [
    ["Drake", "Rich Baby Daddy (feat. Sexyy Red & SZA)", 92, 319191, "1yeB8MUNeLo9Ek1UEpsyz6"],
    # ... Add the rest of the dataset here
]

columns = ["Artist", "Track Name", "Popularity", "Duration (ms)", "Track ID"]
# Load the dataset
df = pd.read_csv("spotify.csv")

# Sort the tracks by popularity and select the least popular 5
least_popular_tracks = df.sort_values("Popularity", ascending=True).head(5)

# Display the result
print("Top 5 Least Popular Tracks:")
print(least_popular_tracks[["Artist", "Track Name"]])

In [12]:
#Q6) Among the top 5 most popular artists, which artist has the highest popularity on average? Calculate and display the average popularity for each artist.

#Code)
import pandas as pd

# Load the dataset
df = pd.read_csv("spotify.csv")

# Calculating the average popularity for each artist
average_popularity = df.groupby('Artist')['Popularity'].mean().sort_values(ascending=False)

# Display the artist with the highest average popularity
top_artist = average_popularity.idxmax()
top_artist_avg_popularity = average_popularity.max()

print(f"The artist with the highest average popularity is {top_artist} with an average popularity of {top_artist_avg_popularity}.")

The artist with the highest average popularity is cassö with an average popularity of 92.0.


In [None]:
#Q7) For the top 5 most popular artists, what are their most popular tracks? List the track name for each artist.

#Code)
import pandas as pd

# Load the dataset
df = pd.read_csv("spotify.csv")

# Ensure the data contains the expected columns
print(df.columns)

# Find the top 5 most popular artists
# Assuming 'artist_name' and 'track_name' are the relevant columns
top_artists = df.groupby('artist_name')['popularity'].mean().sort_values(ascending=False).head(5).index

# Find the most popular track for each of the top 5 artists
most_popular_tracks = {}

for artist in top_artists:
    artist_tracks = df[df['artist_name'] == artist]
    most_popular_track = artist_tracks.loc[artist_tracks['popularity'].idxmax()]['track_name']
    most_popular_tracks[artist] = most_popular_track

# Print the results
for artist, track in most_popular_tracks.items():
    print(f"Artist: {artist}, Most Popular Track: {track}")

In [None]:
#Q8) Visualize relationships between multiple numerical variables simultaneously using a pair plot.

#Code)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv("spotify.csv")

# Inspect the dataset to identify the numerical columns
print(df.info())

# Select numerical columns for the pair plot (adjust based on your dataset)
# If necessary, manually choose numerical columns, e.g., ['popularity', 'duration_ms', 'tempo', 'loudness']
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns

# Create the pair plot
sns.pairplot(df[numerical_columns])
plt.show()

In [None]:
#Q9) Does the duration of tracks vary significantly across different artists? Explore this visually using a box plot or violin plot.

#Code)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv("spotify.csv")

# Inspect the dataset to check for the relevant columns
print(df.info())

# Create a box plot to visualize the distribution of track durations across different artists
plt.figure(figsize=(15, 8))
sns.boxplot(x='artist_name', y='duration_ms', data=df)
plt.xticks(rotation=90)
plt.title('Distribution of Track Durations Across Artists (Box Plot)')
plt.xlabel('Artist')
plt.ylabel('Duration (ms)')
plt.show()

# Create a violin plot to visualize the same
plt.figure(figsize=(15, 8))
sns.violinplot(x='artist_name', y='duration_ms', data=df)
plt.xticks(rotation=90)
plt.title('Distribution of Track Durations Across Artists (Violin Plot)')
plt.xlabel('Artist')
plt.ylabel('Duration (ms)')
plt.show()

In [10]:
#Q10) How does the distribution of track popularity vary for different artists? Visualize this using a swarm plot or a violin plot.

#Code)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv("spotify.csv")

# Inspect the dataset to check for the relevant columns
print(df.info())

# Create a swarm plot to visualize the distribution of track popularity across different artists
plt.figure(figsize=(15, 8))
sns.swarmplot(x='artist_name', y='popularity', data=df)
plt.xticks(rotation=90)
plt.title('Distribution of Track Popularity Across Artists (Swarm Plot)')
plt.xlabel('Artist')
plt.ylabel('Popularity')
plt.show()

# Create a violin plot to visualize the same
plt.figure(figsize=(15, 8))
sns.violinplot(x='artist_name', y='popularity', data=df)
plt.xticks(rotation=90)
plt.title('Distribution of Track Popularity Across Artists (Violin Plot)')
plt.xlabel('Artist')
plt.ylabel('Popularity')
plt.show()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Artist         440 non-null    object
 1   Track Name     440 non-null    object
 2   Popularity     440 non-null    int64 
 3   Duration (ms)  440 non-null    int64 
 4   Track ID       440 non-null    object
dtypes: int64(2), object(3)
memory usage: 17.3+ KB
None


ValueError: Could not interpret value `artist_name` for parameter `x`

<Figure size 1500x800 with 0 Axes>