# Module 3 Mini-Project — Enhanced Employment in India Analysis
Dataset: **Messy_Employment_India_Dataset.csv**
This version includes advanced cleaning, EDA, feature engineering, and a machine-learning model.


## 1. Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Show up to 300 columns when a wide frame is displayed.
pd.set_option('display.max_columns', 300)

# NOTE(review): absolute path tied to one environment — consider a configurable data directory.
csv_file = '/mnt/data/Messy_Employment_India_Dataset.csv'
# The listed placeholder strings are read as missing values, on top of pandas' defaults.
df = pd.read_csv(csv_file, na_values=['?', 'NA', 'NaN', '', 'null'])
df.head()


## 2. Initial EDA

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Summary statistics for every column; include='all' also covers non-numeric columns.
df.describe(include='all')

### Detect numeric & categorical columns

In [None]:
# Split the raw columns by dtype: 64-bit numerics vs. object/category/boolean.
numeric_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
categorical_cols = list(df.select_dtypes(include=['object', 'category', 'bool']).columns)
(numeric_cols, categorical_cols)


### Histograms of top numeric columns

In [None]:
# One histogram panel per column for the first five numeric columns.
df.loc[:, numeric_cols[:5]].hist(figsize=(15, 7), bins=40)
plt.show()


### Categorical distributions

In [None]:
# Bar chart of the 15 most frequent values for each of the first five
# categorical columns, one figure per column.
for col in categorical_cols[:5]:
    plt.figure(figsize=(6, 4))
    df[col].value_counts().iloc[:15].plot.bar()
    plt.title(col)
    plt.show()


### Correlation heatmap

In [None]:
# Pairwise correlations between the numeric columns, drawn without cell labels.
plt.figure(figsize=(12, 10))
sns.heatmap(data=df[numeric_cols].corr(), annot=False, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()


## 3. Data Cleaning

### Missing Values Report

In [None]:
# Missing-value count per column, largest first.
df.isna().sum().sort_values(ascending=False)

### Imputation (Median for numeric, Mode for categorical)

In [None]:
# Work on a copy so the raw frame `df` stays untouched.
df_clean = df.copy()

# Numeric columns: fill gaps with the column median.
# The result is assigned back instead of calling fillna(inplace=True) on
# df_clean[col]: that chained in-place form operates on a temporary object,
# emits a FutureWarning in pandas 2.x and leaves df_clean unchanged once
# Copy-on-Write is enabled.
for col in df_clean.select_dtypes(include=['int64', 'float64']).columns:
    df_clean[col] = df_clean[col].fillna(df_clean[col].median())

# Categorical columns: fill gaps with the most frequent value.
for col in df_clean.select_dtypes(include=['object', 'category', 'bool']).columns:
    if df_clean[col].isna().any():
        mode = df_clean[col].mode()
        # mode() is empty when the column holds no values at all; such a
        # column is left as-is.
        if not mode.empty:
            df_clean[col] = df_clean[col].fillna(mode.iloc[0])

# Missing values remaining per column after imputation.
df_clean.isna().sum()


### Convert numeric-like object columns

In [None]:
def _parse_numeric_like(series, min_parsed_fraction=0.7):
    """Return `series` parsed as numbers, or None if it is not numeric-like.

    Thousands separators (",") and surrounding whitespace are removed from
    string values before parsing. The column counts as numeric-like when more
    than `min_parsed_fraction` of its non-missing values parse successfully;
    values that fail to parse are NaN in the returned series.
    """
    present = series.notna()
    if not present.any():
        return None
    stripped = series.map(
        lambda value: value.replace(',', '').strip() if isinstance(value, str) else value
    )
    # errors='coerce' turns unparseable values into NaN, so the share of
    # non-NaN results really measures how many values are numeric. With
    # errors='ignore' a failed parse returns the input unchanged and every
    # text column would look fully "numeric".
    parsed = pd.to_numeric(stripped, errors='coerce')
    if parsed[present].notna().mean() > min_parsed_fraction:
        return parsed
    return None


for col in df_clean.select_dtypes(include=['object']).columns:
    parsed = _parse_numeric_like(df_clean[col])
    if parsed is None:
        continue
    # Values that could not be parsed are NaN at this point; fill them with the
    # column median so the converted column carries no missing values forward.
    if parsed.isna().any():
        parsed = parsed.fillna(parsed.median())
    # Store the parsed (comma-stripped) values, not a re-parse of the raw text.
    df_clean[col] = parsed

df_clean.dtypes


## 4. Outlier Detection & Treatment (IQR Capping)

In [None]:
def cap_iqr(series, k=1.5):
    """Cap values that fall outside the fences [Q1 - k*IQR, Q3 + k*IQR].

    Parameters
    ----------
    series : pandas.Series
        Numeric values to cap.
    k : float, default 1.5
        Multiplier applied to the inter-quartile range to place the fences.

    Returns
    -------
    pandas.Series
        A new series with out-of-fence values replaced by the nearest fence.
        When the inter-quartile range is zero or undefined (for example a
        flag column where most values are identical, or an empty series) the
        values are returned unchanged, because both fences would coincide
        and capping would collapse the whole column to a single value.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    if np.isnan(iqr) or iqr == 0:
        return series.copy()
    return series.clip(lower=q1 - k * iqr, upper=q3 + k * iqr)

# Replace each 64-bit numeric column of df_clean with its IQR-capped version.
# NOTE(review): every numeric column is capped, which includes the column
# later used as the model target — confirm that is intended.
for col in df_clean.select_dtypes(include=['int64', 'float64']).columns:
    df_clean[col] = df_clean[col].pipe(cap_iqr)

df_clean.describe()


## 5. Feature Engineering (Advanced)

In [None]:
# Engineered features are added to a copy; each one is created only when the
# columns it needs are present.
df_fe = df_clean.copy()

# Experience buckets: [0, 5], (5, 10], (10, 20], (20, 40]; values outside
# this range get no label.
if 'Experience' in df_fe.columns:
    df_fe['Experience_Level'] = pd.cut(
        df_fe['Experience'],
        bins=[0, 5, 10, 20, 40],
        labels=['Junior', 'Mid', 'Senior', 'Expert'],
        include_lowest=True,
    )

# Income per year of experience; the +1 avoids dividing by zero for
# zero-experience rows.
# NOTE(review): derived from Income — must not be used as a predictor when
# Income is the target.
if all(name in df_fe.columns for name in ('Income', 'Experience')):
    df_fe['Income_per_YearExp'] = df_fe['Income'].div(df_fe['Experience'].add(1))

# Income x Age interaction term (also derived from Income, same caveat).
if all(name in df_fe.columns for name in ('Income', 'Age')):
    df_fe['Income_Age_Interaction'] = df_fe['Income'].mul(df_fe['Age'])

df_fe.head()


## 6. Model-Ready Dataset

### Define target variable: **Income**

In [None]:
target_col = 'Income'
if target_col not in df_fe.columns:
    raise KeyError(f"target column {target_col!r} not found in df_fe")

# Income_per_YearExp and Income_Age_Interaction are computed directly from the
# target, so keeping them in X would leak the answer into the features and
# inflate every evaluation metric. They are excluded from the feature matrix.
target_derived_cols = [
    name
    for name in ('Income_per_YearExp', 'Income_Age_Interaction')
    if name in df_fe.columns
]

X = df_fe.drop(columns=[target_col, *target_derived_cols])
y = df_fe[target_col]


### Encoding Categorical Columns

In [None]:
# One-hot encode the categorical feature columns; drop_first=True omits the
# first level of each encoded column.
X_enc = pd.get_dummies(X, drop_first=True)


### Scale Numeric Features

In [None]:
# Standardise the 64-bit numeric feature columns of X_enc in place.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics feed into the scaling — confirm this is acceptable.
num_cols = X_enc.select_dtypes(include=['int64', 'float64']).columns

scaler = StandardScaler()
scaler.fit(X_enc[num_cols])
X_enc[num_cols] = scaler.transform(X_enc[num_cols])

X_enc.head()


## 7. Machine Learning Model (Linear Regression)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_enc, y, random_state=42, test_size=0.2
)

# Fit a linear regression on the training split and predict the hold-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out error metrics; RMSE is the square root of the MSE.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

(mae, rmse, r2)


### Display Model Coefficients

In [None]:
# Pair each training feature with its fitted coefficient and list the 20
# largest (most positive) coefficients first.
coef_df = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': model.coef_})
coef_df = coef_df.sort_values('Coefficient', ascending=False)

coef_df.head(20)
