### Goal:
End-to-end mini project on `BMW sales data (2010-2024) (1).csv`: load the data, explore trends, clean it, analyze it, and visualize time-series patterns.


### 1. Why This Project?

- Understand sales trends over time
- Clean missing entries and standardize numeric fields
- Visualize trends and correlations to derive insights
- Reusable analysis template for CSV sales datasets


In [None]:
# 2. Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Set seaborn's plot theme to the "whitegrid" style for the figures below.
sns.set_theme(style="whitegrid")


In [None]:
# 3. Load Dataset
# The path is relative to the working directory the notebook runs in.
csv_path = "BMW sales data (2010-2024) (1).csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

# Preview the first five rows (shown as the cell output in a notebook).
df.head(n=5)


In [None]:
# 4. Explore Data
# Dimensions of the loaded frame as (rows, columns).
print("Shape:", df.shape)
print()
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print()
# include='all' summarises non-numeric columns alongside the numeric ones.
print(df.describe(include='all'))
print()
print("Missing values per column:\n", df.isnull().sum())


In [None]:
# 5. Data Cleaning (generic)
# Numeric columns: fill missing with median
# `numeric_cols` is reused by the analysis and visualization cells below.
numeric_cols = df.select_dtypes(include=["number"]).columns
for col in numeric_cols:
    # Only touch columns that actually contain gaps.
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

# Categorical columns: fill missing with mode
categorical_cols = df.select_dtypes(exclude=["number"]).columns
for col in categorical_cols:
    if df[col].isnull().any():
        modes = df[col].mode()
        # mode() returns an empty Series when every value in the column is
        # missing; indexing it would raise, so such columns are left as-is.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

# Drop duplicate rows if any, and renumber the index so it stays contiguous.
df = df.drop_duplicates().reset_index(drop=True)

df.head()


In [None]:
# 6. Analysis (generic)
# Work on the numeric subset once and reuse it for both summaries.
numeric_df = df[numeric_cols]

# Per-column summary statistics, transposed so each row is one column of df.
summary_funcs = ['mean', 'median', 'std', 'min', 'max']
stats = numeric_df.agg(summary_funcs).transpose()
print(stats)

# Pairwise correlation between the numeric features.
corr = numeric_df.corr()
print("\nCorrelation matrix (numeric features):\n", corr)


In [None]:
# 7. Visualizations
# 7.1 Time series line plot if date-like column exists
# Detect a date-like column by name (case-insensitive substring match).
date_keywords = ('date', 'year', 'month')
potential_date_cols = [
    c for c in df.columns if any(key in c.lower() for key in date_keywords)
]

if potential_date_cols:
    date_col = potential_date_cols[0]
    # Best-effort plot: any failure is reported and the cell carries on to
    # the heatmap instead of aborting.
    try:
        df_sorted = df.copy()
        # Numeric date-like columns (e.g. an integer year) are kept as-is:
        # pd.to_datetime() would read the numbers as nanoseconds since the
        # Unix epoch and collapse every point onto 1970-01-01.
        if not pd.api.types.is_numeric_dtype(df_sorted[date_col]):
            df_sorted[date_col] = pd.to_datetime(df_sorted[date_col], errors='coerce')
        # Values that could not be parsed became NaT; they cannot be placed
        # on the time axis, so drop them before sorting.
        df_sorted = df_sorted.dropna(subset=[date_col]).sort_values(by=date_col)
        # choose first numeric to plot
        y_col = next((c for c in numeric_cols if c != date_col), None)
        if y_col is None:
            print("Skipping time series plot: no numeric column to plot")
        elif df_sorted.empty:
            print("Skipping time series plot: no usable values in", date_col)
        else:
            plt.figure(figsize=(10, 5))
            plt.plot(df_sorted[date_col], df_sorted[y_col], marker='o')
            plt.title(f"{y_col} over {date_col}")
            plt.xlabel(date_col)
            plt.ylabel(y_col)
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.show()
    except Exception as e:
        print("Skipping time series plot:", e)

# 7.2 Correlation heatmap
# Recomputed here so this cell does not depend on the analysis cell's `corr`.
corr = df[numeric_cols].corr()
if corr.empty:
    # sns.heatmap cannot draw an empty matrix (no numeric columns).
    print("Skipping correlation heatmap: no numeric columns")
else:
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title("Correlation Heatmap (numeric features)")
    plt.show()


In [None]:
# 8. Save cleaned data (optional)
# Written next to the notebook; index=False leaves the row index out of the file.
cleaned_path = "BMW_sales_cleaned.csv"
df.to_csv(path_or_buf=cleaned_path, index=False)

print(f"Saved cleaned data to: {cleaned_path}")
