# Car Prices Data Science Learning

In this project we use a dataset from [Kaggle](https://www.kaggle.com/datasets/syedanwarafridi/vehicle-sales-data).

## Imports and loading data

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.patches import Patch
from sklearn.preprocessing import MinMaxScaler
import seaborn as sb
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import geopandas as gpd

In [None]:
# Read the vehicle-sales CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='car_prices.csv')
print(df.head(n=5))

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
df.info()

In [None]:
# Print descriptive statistics of the DataFrame.
summary_stats = df.describe()
print(summary_stats)

## Cleaning

In [None]:
# Count the missing values in each column.
nan_counts = df.isnull().sum(axis=0)
print(nan_counts)

In [None]:
# Frequency of every model; dropna=False keeps missing models as their own entry.
model_column = df['model']
model_column.value_counts(dropna=False)

In [None]:
# Number of distinct non-missing values in each column.
df.nunique(axis=0, dropna=True)

In [None]:
# Show every distinct value of the 'color' column.
distinct_colors = df['color'].unique()
print(distinct_colors)

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

In [None]:
# Drop the rows whose 'color' is the '—' marker; all other rows are kept.
has_dash_color = df['color'] == '—'
df = df.loc[~has_dash_color]

In [None]:
# Drop every row that has at least one missing value, then print the
# per-column NaN counts to confirm that none remain.
df = df.dropna(axis=0, how='any')
remaining_nans = df.isna().sum()
print(remaining_nans)

In [None]:
# Column dtypes of the cleaned DataFrame (shown as the cell's result).
column_dtypes = df.dtypes
column_dtypes

## Scaling (Normalization)

In [None]:
# Min-max scale the numeric columns with scikit-learn's MinMaxScaler.
numeric_columns = ['year', 'condition', 'odometer', 'mmr', 'sellingprice']
df1 = df[numeric_columns]

# Fit and transform in one pass; fitting separately and then calling
# fit_transform() again would just repeat the same work.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df1)
print(scaler.data_max_)
print(scaled_data)

# Reuse df1's index so every scaled row keeps the label of its source row.
# df's index has gaps after the earlier row drops, so a fresh default index
# would no longer line up with df.
df2 = pd.DataFrame(scaled_data, columns=df1.columns, index=df1.index)

print(df2)


## Data Visualization (Matplotlib)

In [None]:
# Bar chart of the number of cars per colour.
color_counts = df['color'].value_counts()
color_counts.plot.bar()

In [None]:
# Bar chart of the 20 most frequent makes (value_counts() is already sorted
# from most to least frequent, so the first 20 entries are the top 20).
counts = df['make'].value_counts()
top_counts = counts.iloc[:20]
top_counts.plot(kind='bar')

In [None]:
# Compare the colour frequencies of Ford and Chevrolet cars side by side.
df_ford = df[df['make'] == 'Ford']
df_chev = df[df['make'] == 'Chevrolet']

ford_colors = df_ford['color'].value_counts()
chev_colors = df_chev['color'].value_counts()

# Building the frame from two Series aligns them on their colour index.
colors_comparison = pd.DataFrame({'Ford': ford_colors, 'Chevrolet': chev_colors})
colors_comparison.plot(kind='bar')

## Data Visualization (Seaborn)

In [None]:
# Average selling price per (make, year) for a hand-picked set of brands.
selected_makes = ['Chevrolet', 'Lexus', 'Mazda']
is_selected_make = df['make'].isin(selected_makes)
df_sb = df.loc[is_selected_make, ['year', 'make', 'sellingprice']]

mean_price = df_sb.groupby(['make', 'year'])['sellingprice'].mean()
df_avg_price = mean_price.reset_index()
df_avg_price

In [None]:
# Point plot of the average selling price per year, one dashed line per make.
sb.set_style("whitegrid")

plt.figure(figsize=(12, 8))
price_plot_options = dict(
    x='year',
    y='sellingprice',
    hue='make',
    dodge=False,
    markers='o',
    linestyles='--',
)
sb.pointplot(data=df_avg_price, **price_plot_options)

plt.xlabel('Year')
plt.ylabel('Selling Price')
plt.title('Point Plot for Sales Prices by Year and Brand')
plt.xticks(rotation=45)
# Anchor the legend outside the axes so it does not cover the data.
plt.legend(title='Make', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Yearly car counts for the three most frequent makes.
total_counts = df['make'].value_counts()
top_three_makes = total_counts.nlargest(3).index

is_top_three = df['make'].isin(top_three_makes)
filtered_data = df.loc[is_top_three]

cars_per_year_and_make = filtered_data.groupby(['year', 'make']).size()
df_count = cars_per_year_and_make.rename('count').reset_index()

sb.set_style("whitegrid")

plt.figure(figsize=(12, 8))
count_plot_options = dict(
    x='year',
    y='count',
    hue='make',
    dodge=True,
    markers='o',
    linestyles='--',
)
sb.pointplot(data=df_count, **count_plot_options)

plt.xlabel('Year')
plt.ylabel('Count of Cars')
plt.title('Point Plot for Car Counts by Year for Top 3 Brands')
plt.xticks(rotation=45)
# Anchor the legend outside the axes so it does not cover the data.
plt.legend(title='Make', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

## Pie charts

In [None]:
# Two side-by-side pie charts: share of each transmission type and of each brand.
fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{'type': 'domain'}, {'type': 'domain'}]],
    subplot_titles=('Transmission type', 'Brand'),
)

# Take values and labels from the same value_counts() result. value_counts()
# is ordered by frequency while unique() is ordered by first appearance, so
# pairing the two could attach the wrong label to a slice.
transmission_counts = df['transmission'].value_counts()
make_counts = df['make'].value_counts()

fig.add_trace(
    go.Pie(
        values=transmission_counts,
        labels=transmission_counts.index,
        marker=dict(colors=px.colors.sequential.Bluered),
        textposition='inside',
        textinfo='percent+label+value',
        textfont_size=12,
        showlegend=False,
    ),
    row=1,
    col=1,
)

fig.add_trace(
    go.Pie(
        values=make_counts,
        labels=make_counts.index,
        marker=dict(colors=px.colors.sequential.RdBu),
        textposition='inside',
        textinfo='percent+label',
        textfont_size=12,
        showlegend=True,
    ),
    row=1,
    col=2,
)

fig.show()


In [None]:
# Pie chart of brand shares among the cars with a manual transmission.
manual_cars = df.loc[df['transmission'] == 'manual']
manual_make_counts = manual_cars['make'].value_counts()

fig = make_subplots(rows=1, cols=1, specs=[[{'type': 'domain'}]])

# Values and labels both come from the same value_counts() result, so each
# slice is paired with its own brand name.
manual_pie = go.Pie(
    values=manual_make_counts,
    labels=manual_make_counts.index,
    marker=dict(colors=px.colors.sequential.RdBu),
    textposition='inside',
    textinfo='percent+label',
    textfont_size=12,
    showlegend=True,
)
fig.add_trace(manual_pie, row=1, col=1)

fig.update_layout(title_text='Market share by brand among manual transmission cars')

fig.show()

## Scatter plot

In [None]:
# Scatter plot of selling price against the odometer reading.
sb.scatterplot(data=df, x="odometer", y="sellingprice")

plt.title("Selling Price vs. Odometer Reading")
plt.xlabel("Odometer [km]")
plt.ylabel("Selling Price[$]")
plt.grid(True)

plt.show()

In [None]:
# Scatter plot of selling price against the 'mmr' column.
sb.scatterplot(data=df, x="mmr", y="sellingprice")

plt.title("Selling Price vs. MMR")
plt.xlabel("MMR")
plt.ylabel("Selling Price[$]")
plt.grid(True)

plt.show()

In [None]:
# Side-by-side scatter plots of selling price vs. MMR (left) and vs. odometer (right).
fig, ax = plt.subplots(1, 2, figsize=(12, 5))

sb.scatterplot(df, x="mmr", y="sellingprice", ax=ax[0])
sb.scatterplot(df, x="odometer", y="sellingprice", ax=ax[1])

# Label each subplot through its own Axes object. plt.xlabel()/plt.ylabel()
# only affect the current axes (the last subplot), so calling them twice
# overwrote the MMR labels and never labelled the left plot.
ax[0].set_xlabel("MMR")
ax[0].set_ylabel("Selling Price[$]")
ax[1].set_xlabel("Odometer [km]")
ax[1].set_ylabel("Selling Price[$]")

fig.suptitle("Selling Price vs. MMR and Odometer")
fig.tight_layout()

plt.show()

## Geo charts


In [None]:
# Map that colours each state by the make with the most rows in the data.
url = "shapefiles/States_shapefile.shp"
gdf_states = gpd.read_file(url)

# Upper-case the state codes on a copy (df itself stays untouched) before
# joining them against the shapefile's 'State_Code' column.
df_copy = df.copy()
df_copy['state'] = df_copy['state'].str.upper()

# Not used by the plot below; kept because notebook globals may be
# referenced from other cells.
states = df_copy['state'].value_counts()
all_states = gdf_states['State_Code'].unique()

brand_counts = df_copy.groupby(['state', 'make']).size().reset_index(name='counts')

# For each state pick the row with the highest count. idxmax() returns the
# row label of the per-state maximum, and selecting by label keeps the
# 'state' column that the merge below needs. groupby.apply() on a frame that
# still contains the grouping column is deprecated in recent pandas.
top_rows = brand_counts.groupby('state')['counts'].idxmax()
most_popular_brand = brand_counts.loc[top_rows].reset_index(drop=True)

# Left merge: every state shape is kept, even when it has no matching sales rows.
merged_data = gdf_states.merge(most_popular_brand, left_on='State_Code', right_on='state', how='left')

fig, ax = plt.subplots(1, 1, figsize=(20, 20))
gdf_states.boundary.plot(ax=ax, aspect=1, linewidth=.8, color='black')
merged_data.plot(column='make', legend=True, cmap='Set3', ax=ax, aspect=1)

plt.title('Most popular brand in each state')
plt.show()

In [None]:
# Map that colours each state by the make + model with the most rows in the data.
url = "shapefiles/States_shapefile.shp"
gdf_states = gpd.read_file(url)

# Upper-case the state codes on a copy (df itself stays untouched) before
# joining them against the shapefile's 'State_Code' column.
df_copy = df.copy()
df_copy['state'] = df_copy['state'].str.upper()

model_counts = df_copy.groupby(['state', 'make', 'model']).size().reset_index(name='counts')

# For each state pick the row with the highest count. idxmax() returns the
# row label of the per-state maximum, and selecting by label keeps the
# 'state' column that the merge below needs. groupby.apply() on a frame that
# still contains the grouping column is deprecated in recent pandas.
top_rows = model_counts.groupby('state')['counts'].idxmax()
most_popular_model = model_counts.loc[top_rows].reset_index(drop=True)

# A single label per state, e.g. "<make> <model>", used to colour the map.
most_popular_model['make_model'] = most_popular_model['make'] + " " + most_popular_model['model']

# Left merge: every state shape is kept, even when it has no matching sales rows.
merged_data = gdf_states.merge(most_popular_model, left_on='State_Code', right_on='state', how='left')

fig, ax = plt.subplots(1, 1, figsize=(20, 20))
gdf_states.boundary.plot(ax=ax, aspect=1, linewidth=.8, color='black')
merged_data.plot(column='make_model', legend=True, cmap='Set3', ax=ax, aspect=1)

plt.title('Most popular car model in each state')
plt.show()