### Assignment-12 DATA PREPROCESSING AND FEATURE ENGINEERING IN MACHINE LEARNING

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [None]:
# Read the CSV into the working DataFrame; with read_csv's defaults the
# first row of the file supplies the column names.
csv_path = 'adult_with_headers.csv'
df = pd.read_csv(csv_path)

In [None]:
# DataFrame.info() writes its report (columns, non-null counts, dtypes)
# straight to stdout and returns None, so it is called bare instead of
# being wrapped in print().
info_heading = "Dataset Information:"
print(info_heading)
df.info()
     

In [None]:
# Cardinality of every column; nunique() leaves NaN out of the count by
# default.
print("Number of unique values in each column:")
unique_values = df.nunique()
print(unique_values)

In [None]:
# describe() with its default arguments: count, mean, std, min, quartiles
# and max. The table is left as the final expression so the notebook
# renders it.
summary_statistics = df.describe()
print("\nSummary Statistics:")
summary_statistics

In [None]:
# Per-column count of NaN/None cells.
# NOTE(review): placeholder strings (for example a literal '?') are not
# treated as missing by isna() -- confirm the file contains none.
missing_counts = df.isna().sum()
print("\nMissing Values:")
print(missing_counts)

In [None]:
# The dtype pandas inferred for each column when the CSV was parsed.
column_dtypes = df.dtypes
print("\nData Types:")
print(column_dtypes)

In [None]:
# duplicated() flags every row that repeats an earlier, identical row
# (the first occurrence is not flagged), so the sum is the number of rows
# that drop_duplicates() would remove.
duplicate_flags = df.duplicated()
print("\nDuplicate Values:")
print(duplicate_flags.sum())

In [None]:
# Display the rows flagged as repeats of an earlier row; kept as the final
# expression so the notebook renders the resulting table.
is_repeat = df.duplicated()
df[is_repeat]

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# ignore_index=True renumbers the surviving rows 0..n-1. Without it the
# removed rows leave gaps in the index, and any later label-aligned
# operation against positionally built data (for example an axis=1 concat
# with a new DataFrame carrying a default RangeIndex) would pair up the
# wrong rows and introduce all-NaN rows.
df = df.drop_duplicates(ignore_index=True)

In [None]:
# Re-count the duplicate rows; after the de-duplication step this is
# expected to print 0.
remaining_duplicates = df.duplicated().sum()
print("\nDuplicate Values:")
print(remaining_duplicates)
     

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Columns with an int64/float64 dtype; only these are scaled.
numerical_features = df.select_dtypes(include=['float64', 'int64']).columns

# Snapshot of the raw values so that BOTH scalers are fitted on the
# original data. Chaining the scalers on the same columns discards the
# standardised values (min-max scaling of z-scores equals min-max scaling
# of the raw data) and leaves scaler_minmax fitted on z-scores, so its
# inverse_transform would not return the original units.
raw_numeric = df[numerical_features].copy()

# Standardised copy (zero mean, unit variance per column), kept in its own
# frame so it is not lost when df is min-max scaled below.
scaler_standard = StandardScaler()
df_standard_scaled = pd.DataFrame(
    scaler_standard.fit_transform(raw_numeric),
    columns=numerical_features,
    index=df.index,
)

# df itself carries the min-max scaled values (each column mapped to
# [0, 1]), which is what the sequential version ended up with as well.
scaler_minmax = MinMaxScaler()
df[numerical_features] = scaler_minmax.fit_transform(raw_numeric)
     
     
     
     

In [None]:
from sklearn.preprocessing import OneHotEncoder

# drop='first' omits the indicator for the first category of the column
# (it is implied when all other indicators are 0); sparse_output=False
# makes fit_transform return a dense array.
one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)

# Object-dtype columns with fewer than 5 distinct values get one-hot encoded.
categorical_columns = df.select_dtypes(include=['object']).columns
columns_to_encode = [col for col in categorical_columns if df[col].nunique() < 5]

for col in columns_to_encode:
    print(f"One-hot encoding column: {col}")
    encoded_features = one_hot_encoder.fit_transform(df[[col]])
    # categories_[0] lists the categories of the single fitted column;
    # skipping element 0 matches the indicator removed by drop='first'.
    encoded_feature_names = [f"{col}_{cat}" for cat in one_hot_encoder.categories_[0][1:]]
    # Reuse df's index: the encoded rows are in df's row order, but a
    # default RangeIndex would not match df's labels once rows have been
    # dropped, and concat(axis=1) aligns on labels, not on position.
    encoded_df = pd.DataFrame(
        encoded_features,
        columns=encoded_feature_names,
        index=df.index,
    )
    # Swap the original categorical column for its indicator columns.
    df = pd.concat([df.drop(columns=[col]), encoded_df], axis=1)

df.head()


In [None]:
from sklearn.preprocessing import LabelEncoder

# A single encoder object is reused: every fit_transform call refits it,
# so after the loop it only holds the classes of the last column encoded.
label_encoder = LabelEncoder()

# Object-dtype columns still present in df that have five or more
# distinct values.
categorical_columns = df.select_dtypes(include=['object']).columns
category_counts = df[categorical_columns].nunique()
columns_to_encode = [
    name for name, n_unique in category_counts.items() if n_unique >= 5
]

# Replace each selected column, in place, with its integer class codes.
for col in columns_to_encode:
    print(f"Label encoding column: {col}")
    encoded_values = label_encoder.fit_transform(df[col])
    df[col] = encoded_values

df.head()

In [None]:
# hours_per_week is min-max scaled earlier in this file, so its smallest
# value is exactly 0. Dividing by it would produce inf (or NaN for 0/0);
# mapping 0 to NaN makes those rows explicitly missing instead.
hours_worked = df['hours_per_week'].replace(0, np.nan)

# NOTE(review): 'income_ >50K' is presumably the one-hot indicator created
# by the encoding step, and capital_loss is a scaled value, so this ratio
# is not an income in currency units -- confirm it is the intended feature.
df['income_per_hour'] = (df['income_ >50K'] - df['capital_loss']) / hours_worked

# Net capital movement: gain minus loss.
df['total_capital_gain_loss'] = df['capital_gain'] - df['capital_loss']
df.head()
     

In [None]:
from scipy.stats import skew


def _skew_without_nan(series):
    """Return scipy's skewness of *series* computed on its non-NaN values."""
    return skew(series.dropna())


# Every int64/float64 column currently in df.
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns

# One skewness value per numerical column.
skewness = df[numerical_columns].apply(_skew_without_nan)

# A positive skewness means a longer right tail.
is_right_skewed = skewness > 0
right_skewed_columns = skewness[is_right_skewed].index

# Report each right-skewed column together with its skewness.
for col in right_skewed_columns:
    print(f"Column '{col}' has skewness of {skewness[col]} (right skew)")


In [None]:
import plotly.express as px

# Recompute the per-column skewness; `skew` and `numerical_columns` are
# expected to be defined already by an earlier cell.
skewness = df[numerical_columns].apply(lambda column: skew(column.dropna()))

# Keep only the columns whose skewness is positive (right tail).
positive_skew = skewness > 0
right_skewed_columns = skewness[positive_skew].index

# Two-column table (column name, skewness) to feed the bar chart.
skewness_df = pd.DataFrame({
    'Column': right_skewed_columns,
    'Skewness': skewness[right_skewed_columns],
})

# Interactive bar chart, one bar per right-skewed column, coloured by its
# skewness value.
fig = px.bar(
    skewness_df,
    x='Column',
    y='Skewness',
    title='Skewness of Numerical Columns (Right Skewed)',
    labels={'Skewness': 'Skewness'},
    color='Skewness',
    color_continuous_scale='RdYlBu',
)

# Axis titles and colour-bar title.
fig.update_layout(
    xaxis_title='Column',
    yaxis_title='Skewness',
    coloraxis_colorbar={'title': 'Skewness'},
)

fig.show()

In [None]:
# Series.skew() is pandas' bias-corrected sample skewness, so the figure
# can differ slightly from scipy.stats.skew, which is biased by default.
capital_gain_values = df['capital_gain']
original_skewness = capital_gain_values.skew()
print("Original Skewness of 'capital_gain':", original_skewness)
     

In [None]:
# log1p computes log(1 + x), which is defined at x == 0.
# NOTE(review): capital_gain was min-max scaled to [0, 1] earlier in this
# file, so the log compresses only the range [0, 1] here -- confirm that
# transforming the scaled values (rather than the raw ones) is intended.
capital_gain = df['capital_gain']
df['capital_gain_log'] = np.log1p(capital_gain)
     

In [None]:
# Skewness of the log-transformed column, computed with the same pandas
# estimator as the pre-transformation figure so the two are comparable.
log_capital_gain = df['capital_gain_log']
transformed_skewness = log_capital_gain.skew()
print("Skewness of 'capital_gain' after Log Transformation:", transformed_skewness)

In [None]:
import plotly.express as px

# Refresh the list of int64/float64 columns so that columns added since it
# was last computed are included.
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns

# One box per numerical column, all drawn on a shared linear y-axis.
fig = px.box(
    df,
    y=numerical_columns,
    title='Box plot of Numerical Columns',
)
layout_options = {
    'xaxis_title': 'Column',
    'yaxis_title': 'Value',
    'yaxis': {'type': 'linear'},
}
fig.update_layout(**layout_options)
fig.show()