### Goal
End-to-end mini project on `CollegePlacement.csv`: load, explore, clean, analyze placement outcomes, and visualize insights.
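Dataset files listed with this project (this notebook loads only `CollegePlacement.csv`):

- `BMW sales data (2010-2024) (1).csv`
- `CollegePlacement.csv`
- `global_population_stats_2024.csv`
- `Automobile_data.csv`
- `Student data 4.csv`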

### 1. Why This Project?

- Understand factors affecting placement outcomes
- Clean and standardize numeric/categorical columns
- Visualize relationships and derive insights
- Provide a reusable template for similar classification-style CSV datasets


In [None]:
# 2. Import Libraries
# pandas handles loading/cleaning, matplotlib + seaborn handle the plots.
# NOTE(review): `np` is not referenced in the cells visible here — confirm it
# is still needed before removing it.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Set seaborn's "whitegrid" theme once, up front, before any figure is drawn.
sns.set_theme(style="whitegrid")


In [None]:
# 3. Load Dataset
# Relative path: the CSV is resolved against the current working directory.
csv_path = "CollegePlacement.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

# Preview the first five rows (shown as the cell's output).
df.head(5)


In [None]:
# 4. Explore Data
# Structural overview of the raw frame: dimensions, per-column dtype and
# non-null counts, summary statistics for every column, and missing-value
# counts per column.
print("Shape:", df.shape)
print()
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()
print()
# include='all' makes describe() cover non-numeric columns as well.
print(df.describe(include='all'))
print()
print("Missing values per column:\n", df.isnull().sum())


In [None]:
# 5. Data Cleaning (generic)
# Impute missing values column by column, then remove exact duplicate rows.
# `numeric_cols` is reused by the analysis and visualization cells below.

# Numeric columns: fill missing with median
# (a column that is entirely null has a NaN median and is left unchanged)
numeric_cols = df.select_dtypes(include=["number"]).columns
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

# Categorical columns: fill missing with mode
# "Categorical" here means every column whose dtype is not numeric.
categorical_cols = df.select_dtypes(exclude=["number"]).columns
for col in categorical_cols:
    if not df[col].isnull().any():
        continue
    modes = df[col].mode()
    # An entirely-null column has no mode: mode() returns an empty Series and
    # indexing its first element would raise. Leave such a column untouched.
    if modes.empty:
        continue
    # When several values tie, mode() returns them sorted; take the first.
    df[col] = df[col].fillna(modes.iloc[0])

# Drop duplicate rows if any
# (done after imputation, so rows that only became identical once their gaps
# were filled are removed as well)
df = df.drop_duplicates().reset_index(drop=True)

df.head()


In [None]:
# 6. Analysis (generic)
# Both summaries below work on the numeric slice of the cleaned frame.
numeric_df = df[numeric_cols]

# Descriptive statistics, transposed so each feature is a row and each
# statistic is a column
summary_funcs = ['mean', 'median', 'std', 'min', 'max']
stats = numeric_df.agg(summary_funcs).T
print(stats)

# Pairwise correlation of the numeric features (DataFrame.corr defaults);
# `corr` is reused by the heatmap in the visualization cell
corr = numeric_df.corr()
print("\nCorrelation matrix (numeric features):\n", corr)


In [None]:
# 7. Visualizations
# Three figures, each drawn through pyplot's implicit "current figure" state:
# every section creates its figure, draws, titles it, then calls plt.show().
# The statement order inside each section therefore matters.

# 7.1 Histograms for numeric columns
# Only the first six numeric columns are plotted (slice below).
num_cols_to_plot = numeric_cols[:6]
_df = df[num_cols_to_plot]
# DataFrame.hist creates its own figure with one subplot per column.
_df.hist(figsize=(12, 8), bins=15)
# suptitle targets the current figure, i.e. the one hist() just created.
plt.suptitle("Histograms (numeric features)")
plt.show()

# 7.2 Boxplots for numeric columns
# Same six-column subset as the histograms, drawn side by side on one axes.
plt.figure(figsize=(12, 6))
sns.boxplot(data=_df)
plt.title("Boxplots (numeric features)")
# Rotate the column-name tick labels by 45 degrees.
plt.xticks(rotation=45)
plt.show()

# 7.3 Heatmap of correlation
# Uses `corr` from the analysis cell, which covers ALL numeric columns,
# not just the six plotted above.
plt.figure(figsize=(8, 6))
# annot=True with fmt='.2f' prints each coefficient to two decimals.
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Correlation Heatmap (numeric features)")
plt.show()


In [None]:
# 8. Save cleaned data (optional)
# Persist the cleaned frame next to the notebook; index=False keeps the
# row index out of the file.
cleaned_path = "CollegePlacement_cleaned.csv"
df.to_csv(path_or_buf=cleaned_path, index=False)

print(f"Saved cleaned data to: {cleaned_path}")
