# Exploratory Data Analysis (EDA) and Data Preprocessing

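This notebook loads `dataset.csv`, explores it (missing values, duplicate records, univariate distributions, outliers, and correlations), and then preprocesses it (missing-value imputation, one-hot encoding of categorical variables, and standard scaling of numeric features).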

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the raw CSV into a DataFrame, then display its first five rows.
df = pd.read_csv(filepath_or_buffer='dataset.csv')
df.head(n=5)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Print a concise per-column summary (non-null counts and dtypes).
df.info()

In [None]:
# Descriptive statistics; with default arguments pandas summarises the
# numeric columns when the frame contains any.
df.describe()

## Missing Values Analysis

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
# Share of missing entries per column, expressed as a percentage.
df.isna().mean().mul(100)

## Duplicate Records

In [None]:
# Count rows that repeat an earlier row across all columns.
df.duplicated(subset=None, keep='first').sum()

In [None]:
# Keep the first occurrence of every fully duplicated row and drop the rest.
df = df.drop_duplicates(subset=None, keep='first')

## Data Type Separation

In [None]:
# Partition the column labels into numeric columns and everything else,
# then display both index objects.
num_cols = df.select_dtypes(include='number').columns
cat_cols = df.select_dtypes(exclude='number').columns
(num_cols, cat_cols)

## Univariate Analysis

In [None]:
# One histogram per numeric column, drawn on a shared 12x10 figure.
df.loc[:, num_cols].hist(figsize=(12, 10))
plt.show()

In [None]:
# Print the frequency table of each non-numeric column, followed by a
# blank separator line.
for col in cat_cols:
    print(df[col].value_counts(), end='\n\n')

## Outlier Detection

In [None]:
# Draw a separate boxplot for each numeric column, titled with the column
# name, so extreme values can be inspected visually.
for col in num_cols:
    ax = sns.boxplot(x=df[col])
    ax.set_title(col)
    plt.show()

## Correlation Analysis

In [None]:
# Pairwise correlations between the numeric columns, rendered as an
# annotated heatmap.
corr_matrix = df[num_cols].corr()
plt.figure(figsize=(10, 6))
sns.heatmap(data=corr_matrix, annot=True, cmap='coolwarm')
plt.show()

## Handling Missing Values

In [None]:
# Impute missing values: numeric columns receive their median, every other
# column receives its most frequent value (mode).
#
# The fill values are collected first and applied with a single
# DataFrame.fillna call that rebinds `df`. The chained form
# `df[col].fillna(..., inplace=True)` operates on an intermediate Series;
# pandas 2.x warns about it and, with Copy-on-Write enabled, it no longer
# modifies `df` at all.
fill_values = {}
for col in num_cols:
    median = df[col].median()
    # A column with no non-null values has a NaN median: nothing to fill with.
    if pd.notna(median):
        fill_values[col] = median
for col in cat_cols:
    modes = df[col].mode()
    # mode() is empty when the column has no non-null values; taking its
    # first element unguarded would raise.
    if not modes.empty:
        fill_values[col] = modes.iloc[0]
if fill_values:
    # Dict keys are column labels, so each column is filled with its own value.
    df = df.fillna(value=fill_values)

## Encoding Categorical Variables

In [None]:
# One-hot encode via pandas.get_dummies, dropping the first level of each
# encoded column, then preview the first five rows of the result.
df_encoded = pd.get_dummies(data=df, drop_first=True)
df_encoded.head(n=5)

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit a StandardScaler on the numeric columns and overwrite them in place
# with their scaled values; the remaining columns are left untouched.
scaler = StandardScaler()
numeric_block = df_encoded[num_cols]
df_encoded[num_cols] = scaler.fit_transform(numeric_block)

## Final Dataset

In [None]:
# Dimensions of the encoded frame as a (rows, columns) tuple.
df_encoded.shape

In [None]:
# Preview the first five rows of the final preprocessed frame.
df_encoded.head(n=5)