# Sai Pavan Kumar M
# Data Science - Batch January 2024 (Hyderabad) - Assignment 12

# EDA2

In [None]:
# Load necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ppscore as pps
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import IsolationForest

In [None]:
# Show every column when a DataFrame is displayed (no truncation of wide frames).
# Uses the public pd.set_option entry point rather than the incidental pd.pandas alias.
pd.set_option('display.max_columns', None)

# Load dataset from the CSV file (path is relative to the working directory)
df = pd.read_csv('adult_with_headers.csv')
# Preview the first 10 rows
df.head(10)

## Data Exploration and Preprocessing

In [None]:
# Display summary statistics for every column; include='all' covers the
# non-numeric columns as well as the numeric ones
df.describe(include='all')

In [None]:
# Check for missing values: count the null entries in each column
df.isnull().sum()

In [None]:
# Check data types: print the dtype pandas assigned to each column
print(df.dtypes)

In [None]:
# Names of the numerical feature columns (reused by the scaling,
# outlier-detection and correlation cells)
numerical_columns = [
    'age',
    'fnlwgt',
    'education_num',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
]

In [None]:
# Standard Scaling: work on a copy so the original `df` is left unchanged,
# then overwrite only the numerical columns with their standardised values
df_standard_scaled = df.copy()
standard_scaler = StandardScaler()
df_standard_scaled[numerical_columns] = standard_scaler.fit_transform(
    df[numerical_columns]
)
df_standard_scaled.head()

In [None]:
# Min-Max Scaling: work on a copy so the original `df` is left unchanged,
# then overwrite only the numerical columns with their rescaled values
df_minmax_scaled = df.copy()
minmax_scaler = MinMaxScaler()
df_minmax_scaled[numerical_columns] = minmax_scaler.fit_transform(
    df[numerical_columns]
)
df_minmax_scaled.head()

#### Discuss the scenarios where each scaling technique is preferred and why
- Standard Scaling (StandardScaler): This technique transforms the data to have a mean of 0 and a standard deviation of 1. It is preferred when the data follows a normal distribution or when the model assumes that the data is normally distributed (e.g., linear regression, logistic regression).

- Min-Max Scaling (MinMaxScaler): This technique scales the data to a fixed range, usually [0, 1]. It is preferred when the data does not necessarily follow a normal distribution and you want to preserve the relationships of the original data. It is also useful when using algorithms that do not assume any specific distribution of the data, such as k-nearest neighbors and neural networks.


## Encoding Techniques

In [None]:
# Identify categorical columns: select the columns whose dtype is 'object'
categorical_columns = df.select_dtypes(include=['object']).columns
# Display the resulting column index
categorical_columns

In [None]:
# Apply one-hot encoding to categorical variables with fewer than 5 categories.
# drop_first=True removes the first level of each variable, giving k-1 dummy
# columns for k categories.
one_hot_columns = [
    name for name in categorical_columns
    if df[name].nunique() < 5
]
df_one_hot_encoded = pd.get_dummies(df, columns=one_hot_columns, drop_first=True)
df_one_hot_encoded.head()

In [None]:
# Identify categorical columns (those stored with dtype 'object')
categorical_columns = df.select_dtypes(include=['object']).columns

# Apply Label Encoding to categorical variables with 5 or more categories
# (the complement of the "< 5" rule used for one-hot encoding)
label_encode_columns = [col for col in categorical_columns if df[col].nunique() >= 5]

# Keep one fitted encoder per column so every integer code can be mapped back
# to its original category later (e.g. label_encoders[col].inverse_transform).
# Re-fitting a single shared encoder would retain only the last column's mapping.
label_encoders = {}

# Defined up front so the name exists even when no column qualifies; after the
# loop it refers to the encoder fitted on the last processed column.
label_encoder = LabelEncoder()

# Apply Label Encoding.
# NOTE: this overwrites the selected columns of `df` in place, so `df` no longer
# holds the original string categories for them after this cell runs.
for col in label_encode_columns:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

# Display the first few rows of the encoded dataframe
df.head()

#### Discuss the pros and cons of One-Hot Encoding and Label Encoding.
##### One-Hot Encoding
- Pros:
    - No Ordinal Relationship Assumption:
        - One-Hot Encoding does not assume any ordinal relationship between the categories.
        - It treats each category as an independent entity.
        - Suitable for nominal categorical variables (e.g., color, gender).
    - Avoids Arbitrary Ranking:
        - Prevents assigning arbitrary ranking to categories, which could mislead some algorithms (e.g., linear regression).
- Cons:
    - Increased Dimensionality:
        - Increases the number of features, especially for categorical variables with many categories. This can lead to the curse of dimensionality, making the model more complex and computationally expensive.
    - Sparse Matrix:
        - Results in a sparse matrix, where many values are zeros, which can consume more memory and slow down the computation.

##### Label Encoding
- Pros:
    - Simplicity:
        - Simple and straightforward to implement.
        - Converts categories to integers, which can be easily interpreted by most algorithms.
    - No Increased Dimensionality:
        - Does not increase the dimensionality of the dataset, keeping it compact.
- Cons:
    - Assumes Ordinal Relationship:
        - Assumes an ordinal relationship between the categories, which may not be true for nominal categorical variables. This can mislead algorithms into interpreting these numerical values as having some sort of ranking or order.
    - Potential Bias:
        - Some algorithms might interpret the encoded integers as having inherent ordinal importance, which can introduce bias and affect the model’s performance.

## Feature Engineering

In [None]:
# Create new feature: 'age_group'
# Age groups: 0-18 (Child), 19-35 (Young Adult), 36-60 (Adult), 61+ (Senior)
# The last bin is open-ended (np.inf) so ages above 100 are labelled 'Senior'
# instead of becoming NaN, and include_lowest=True keeps an age of exactly 0
# inside the 'Child' bin. Ages already within (0, 100] are binned as before.
df_one_hot_encoded['age_group'] = pd.cut(
    df['age'],
    bins=[0, 18, 35, 60, np.inf],
    labels=['Child', 'Young Adult', 'Adult', 'Senior'],
    include_lowest=True,
)

In [None]:
# Create new feature: 'capital_diff' (difference between capital gain and capital loss),
# computed row-wise as capital_gain - capital_loss
df_one_hot_encoded['capital_diff'] = df['capital_gain'] - df['capital_loss']

In [None]:
# Apply log transformation to 'capital_gain' due to its high skewness.
# np.log1p computes log(1 + x), so zero values map to 0 rather than -inf.
df_one_hot_encoded['log_capital_gain'] = np.log1p(df['capital_gain'])

In [None]:
# Check skewness before and after transformation: print the skew of the raw
# 'capital_gain' column and of its log-transformed counterpart
print("Skewness of 'capital_gain' before transformation:", df['capital_gain'].skew())
print("Skewness of 'log_capital_gain' after transformation:", df_one_hot_encoded['log_capital_gain'].skew())

In [None]:
# Display the first few rows of the modified dataframe
# (the one-hot encoded frame with the engineered feature columns added)
df_one_hot_encoded.head()

## Feature Selection

In [None]:
# Isolation Forest outlier detector; contamination is the expected proportion of outliers
clf = IsolationForest(contamination=0.01, random_state=42)

# Fit on the numerical features and label every row in one step
outliers = clf.fit_predict(df[numerical_columns])

# Drop the rows flagged as outliers (those labelled -1)
df_cleaned = df[outliers != -1]

# Compare the row/column counts before and after the removal
print("Original dataset shape:", df.shape)
print("Cleaned dataset shape:", df_cleaned.shape)

In [None]:
# Pearson correlation between the numerical features of the cleaned data
correlation_matrix = df_cleaned[numerical_columns].corr()

# Square each coefficient (r^2) with a vectorized operation instead of the
# deprecated element-wise DataFrame.applymap; the resulting values are the same.
# NOTE: r^2 is only a symmetric, linear stand-in -- it is not the actual
# Predictive Power Score, which the `ppscore` package imported at the top of
# this file is meant to compute. The name `pps_matrix` is kept for compatibility.
pps_matrix = correlation_matrix ** 2

print(pps_matrix)