<a href="https://colab.research.google.com/github/Tushar-rancy/EDA-Assignment/blob/main/EDA_Assignment_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load the car dataset (assume 'car.csv' is uploaded)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the car sales CSV into a DataFrame and preview its first five rows.
car_df = pd.read_csv(filepath_or_buffer="car.csv")
car_df.head(n=5)

### Q1: What is the average selling price of cars for each dealer, and how does it compare across different dealers?

In [None]:
# Mean selling price per dealer, ordered from the highest average to the lowest.
avg_price_by_dealer = car_df.groupby('dealer')['selling_price'].mean()
avg_price_by_dealer = avg_price_by_dealer.sort_values(ascending=False)

# Draw the bar chart and label it through the returned Axes.
ax = avg_price_by_dealer.plot.bar()
ax.set_title('Average Selling Price by Dealer')
ax.set_ylabel('Selling Price')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

### Q2: Which car brand (Company) has the highest variation in prices, and what does this tell us about the pricing trends?

In [None]:
# Standard deviation of selling price per brand; keep the ten largest values.
price_std_by_company = car_df.groupby('company')['selling_price'].std()
most_variable_brands = price_std_by_company.sort_values(ascending=False).head(10)

ax = most_variable_brands.plot.bar()
ax.set_title('Price Variation by Brand')
ax.set_ylabel('Standard Deviation')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

### Q3: What is the distribution of car prices for each transmission type, and how do the interquartile ranges compare?

In [None]:
# One box per transmission type; the box height shows the interquartile range.
ax = sns.boxplot(x='transmission', y='selling_price', data=car_df)
ax.set_title('Selling Price by Transmission Type')
plt.show()

### Q4: What is the distribution of car prices across different regions?

In [None]:
# Box plot of selling price for every region.
ax = sns.boxplot(x='region', y='selling_price', data=car_df)
plt.xticks(rotation=45)
ax.set_title('Selling Price by Region')
plt.show()

### Q5: What is the distribution of cars based on body styles?

In [None]:
# Body styles ordered by how often they occur (value_counts sorts descending).
body_styles_by_count = car_df['body_style'].value_counts().index

ax = sns.countplot(x='body_style', data=car_df, order=body_styles_by_count)
ax.set_title('Count of Cars by Body Style')
plt.xticks(rotation=45)
plt.show()

### Q6: How does the average selling price of cars vary by customer gender and annual income?

In [None]:
# Figure 1: mean selling price for each customer gender.
avg_price_by_gender = car_df.groupby('customer_gender')['selling_price'].mean()
ax = avg_price_by_gender.plot.bar()
ax.set_title('Avg Price by Gender')
ax.set_ylabel('Selling Price')
plt.show()

# Figure 2: selling price against annual income, points coloured by gender.
ax = sns.scatterplot(
    x='customer_annual_income',
    y='selling_price',
    hue='customer_gender',
    data=car_df,
)
ax.set_title('Price vs Income by Gender')
plt.show()

### Q7: What is the distribution of car prices by region, and how does the number of cars sold vary by region?

In [None]:
# Figure 1: violin plot of selling price per region.
ax = sns.violinplot(x='region', y='selling_price', data=car_df)
ax.set_title('Price Distribution by Region')
plt.xticks(rotation=45)
plt.show()

# Figure 2: number of rows (cars sold) recorded for each region.
sales_per_region = car_df['region'].value_counts()
ax = sales_per_region.plot.bar()
ax.set_title('Number of Cars Sold by Region')
ax.set_ylabel('Count')
plt.show()

### Q8: How does the average car price differ between cars with different engine sizes?

In [None]:
# Mean selling price for each distinct engine_size value, one bar per value.
avg_price_by_engine_size = car_df.groupby('engine_size')['selling_price'].mean()

ax = avg_price_by_engine_size.plot.bar()
ax.set_title('Avg Price by Engine Size')
ax.set_ylabel('Selling Price')
plt.show()

### Q9: How do car prices vary based on the customer’s annual income bracket?

In [None]:
# Split customers into income quartiles and give each quartile a readable label.
income_labels = ['Low', 'Medium', 'High', 'Very High']
car_df['income_bracket'] = pd.qcut(
    car_df['customer_annual_income'],
    q=4,
    labels=income_labels,
)

# Compare the selling-price distribution across the four brackets.
ax = sns.boxplot(data=car_df, x='income_bracket', y='selling_price')
ax.set_title('Car Price by Income Bracket')
plt.show()

### Q10: What are the top 5 car models with the highest number of sales, and how does their price distribution look?

In [None]:
# The five models with the most rows (sales); value_counts() sorts descending,
# so the index is already ranked from best seller downwards.
top_models = car_df['model'].value_counts().head(5).index

# Keep only the sales that belong to those five models.
top_model_sales = car_df[car_df['model'].isin(top_models)]

# Pass `order` so the boxes appear in sales-rank order (best seller first)
# rather than in seaborn's default category order.
ax = sns.boxplot(
    data=top_model_sales,
    x='model',
    y='selling_price',
    order=list(top_models),
)
ax.set_title('Price Distribution of Top 5 Selling Car Models')
plt.show()

### Q11: How does car price vary with engine size across different car colors, and which colors have the highest price variation?

In [None]:
# Figure 1: spread of selling price for each colour (wider boxes = more variation).
ax = sns.boxplot(x='color', y='selling_price', data=car_df)
ax.set_title('Price Distribution by Color')
plt.xticks(rotation=45)
plt.show()

# Figure 2: selling price against engine size, points coloured by car colour.
ax = sns.scatterplot(x='engine_size', y='selling_price', hue='color', data=car_df)
ax.set_title('Price vs Engine Size by Color')
plt.show()

### Q12: Is there any seasonal trend in car sales based on the date of sale?

In [None]:
# Parse the sale date and derive the calendar month (1-12) of each sale.
car_df['sale_date'] = pd.to_datetime(car_df['sale_date'])
car_df['sale_month'] = car_df['sale_date'].dt.month

# Count sales per month and plot them in calendar order.
sales_per_month = car_df['sale_month'].value_counts().sort_index()
ax = sales_per_month.plot.bar()
ax.set_title('Monthly Car Sales')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Sales')
plt.show()

### Q13: How does the car price distribution change when considering different combinations of body style and transmission type?

In [None]:
# Box per body style, split side by side by transmission type via `hue`.
ax = sns.boxplot(
    x='body_style',
    y='selling_price',
    hue='transmission',
    data=car_df,
)
ax.set_title('Price by Body Style and Transmission')
plt.xticks(rotation=45)
plt.show()

### Q14: What is the correlation between car price, engine size, and annual income of customers, and how do these features interact?

In [None]:
# Pairwise correlation between price, engine size and customer income.
correlation_columns = ['selling_price', 'engine_size', 'customer_annual_income']
correlation_matrix = car_df[correlation_columns].corr()

# annot=True writes each coefficient inside its cell.
ax = sns.heatmap(correlation_matrix, annot=True)
ax.set_title('Correlation Heatmap')
plt.show()

### Q15: How does the average car price vary across different car models and engine types?

In [None]:
# Mean selling price for every (model, engine_type) pair.
avg_price = car_df.groupby(['model', 'engine_type'])['selling_price'].mean()

# Pivot engine_type into columns so each model gets one bar per engine type.
avg_price_by_model = avg_price.unstack()

ax = avg_price_by_model.plot.bar(figsize=(10, 6))
ax.set_title('Average Car Price by Model and Engine Type')
ax.set_ylabel('Selling Price')
plt.tight_layout()
plt.show()

---

In [None]:
# Load the Spotify dataset (assuming 'spotify.csv' is uploaded)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the Spotify tracks CSV into a DataFrame and preview its first five rows.
spotify_df = pd.read_csv(filepath_or_buffer="spotify.csv")
spotify_df.head(n=5)

### Q1: Check for null values and duplicates, and handle them.

In [None]:
# Check: missing values per column and the number of fully duplicated rows.
# Both counts are taken on the data as it stands, before any rows are removed.
print(spotify_df.isnull().sum())
print(f"Duplicate rows: {spotify_df.duplicated().sum()}")

# Handle: drop rows with any missing value, then drop exact duplicate rows.
# The result is reassigned rather than modified with inplace=True, so the
# cleaning is a single expression and never mutates the loaded frame in place.
spotify_df = spotify_df.dropna().drop_duplicates()

### Q2: What is the distribution of popularity among the tracks?

In [None]:
# Histogram of popularity
sns.histplot(spotify_df['popularity'], bins=30, kde=True)
plt.title('Track Popularity Distribution')
plt.xlabel('Popularity')
plt.ylabel('Count')
plt.show()

### Q3: Is there a relationship between track popularity and duration?

In [None]:
# Scatter plot
sns.scatterplot(x='duration_ms', y='popularity', data=spotify_df)
plt.title('Popularity vs Duration')
plt.xlabel('Duration (ms)')
plt.ylabel('Popularity')
plt.show()

### Q4: Which artist has the highest number of tracks?

In [None]:
# Ten artists with the most tracks; value_counts() puts the largest count first.
top_artists = spotify_df['artist_name'].value_counts().head(10)

# Horizontal bars: artist names on the y-axis, track counts on the x-axis.
ax = sns.barplot(x=top_artists.values, y=top_artists.index)
ax.set_title('Top Artists by Number of Tracks')
ax.set_xlabel('Number of Tracks')
ax.set_ylabel('Artist')
plt.show()

### Q5: What are the top 5 least popular tracks?

In [None]:
# Sort tracks by ascending popularity and show the first five rows.
track_columns = ['artist_name', 'track_name', 'popularity']
tracks_by_popularity = spotify_df[track_columns].sort_values(by='popularity')
tracks_by_popularity.head(5)

### Q6: Among the top 5 popular artists, who has the highest average popularity?

In [None]:
# "Top 5" here means the five artists with the most tracks in the dataset.
top5_artists = spotify_df['artist_name'].value_counts().head(5).index

# Mean popularity of each of those artists' tracks, highest average first.
from_top5_artist = spotify_df['artist_name'].isin(top5_artists)
avg_popularity = spotify_df[from_top5_artist].groupby('artist_name')['popularity'].mean()
avg_popularity.sort_values(ascending=False)

### Q7: Most popular track for each of the top 5 artists.

In [None]:
# Keep only tracks by the artists in top5_artists (defined in the Q6 cell).
by_top5_artist = spotify_df['artist_name'].isin(top5_artists)

# Order by artist name, then by popularity descending, so that each artist's
# most popular track is the first row of their group.
ranked_tracks = spotify_df[by_top5_artist].sort_values(
    ['artist_name', 'popularity'],
    ascending=[True, False],
)

# First row per artist, restricted to the columns of interest.
best_per_artist = ranked_tracks.groupby('artist_name').head(1)
best_per_artist[['artist_name', 'track_name', 'popularity']]

### Q8: Visualize relationships between numerical variables using a pair plot.

In [None]:
import seaborn as sns
# Scatter-matrix of four numeric features, with each feature's distribution on the diagonal.
pairplot_columns = ['popularity', 'duration_ms', 'danceability', 'energy']
sns.pairplot(spotify_df[pairplot_columns])

# y=1.02 places the figure-level title just above the grid of subplots.
plt.suptitle("Pairplot of Selected Features", y=1.02)
plt.show()

### Q9: Does track duration vary significantly across artists?

In [None]:
# Boxplot for duration by top 5 artists
sns.boxplot(x='artist_name', y='duration_ms', data=spotify_df[spotify_df['artist_name'].isin(top5_artists)])
plt.title('Duration by Artist')
plt.xticks(rotation=45)
plt.show()

### Q10: How does popularity distribution vary across different artists?

In [None]:
# Violin plot for popularity
sns.violinplot(x='artist_name', y='popularity', data=spotify_df[spotify_df['artist_name'].isin(top5_artists)])
plt.title('Popularity Distribution by Artist')
plt.xticks(rotation=45)
plt.show()