# Handling Numerical Data

## Initial Analysis of the Dataset

In [None]:
import pandas as pd

# Read the CSV file from the working directory into a DataFrame
dataset_path = 'Prakriti_With_Features.csv'
df = pd.read_csv(dataset_path)

# Preview the leading rows of the table
print("First 5 rows of the dataset:")
print(df.head(5))

# Summarise the columns: dtypes and non-null counts.
# info() prints its report directly, so it is not wrapped in print().
print("\nDataFrame Info:")
df.info()

## Rescaling a Feature

In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Work on a separate copy so the loaded DataFrame stays untouched
df_scaled = df.copy()

# Ordinal encoding for the 'Height' categories; labels that are not keys of
# this dict come out of .map() as NaN
height_mapping = {
    'Short': 0,
    'Average': 1,
    'Tall': 2,
}
df_scaled['Height_numeric'] = df_scaled['Height'].map(height_mapping)

# The scaler wants a 2-D (n_samples, 1) array, so add a trailing axis
height_data = df_scaled['Height_numeric'].values[:, np.newaxis]

# Learn the column's min/max, then rescale the same data with them
scaler = MinMaxScaler()
scaler.fit(height_data)
df_scaled['Height_scaled'] = scaler.transform(height_data)

# Show the raw label, its integer code, and the rescaled value side by side
print("Original and scaled 'Height' data:")
height_columns = ['Height', 'Height_numeric', 'Height_scaled']
print(df_scaled[height_columns].head())

## Standardizing a Feature

In [None]:
from sklearn.preprocessing import StandardScaler

# Create a copy for this operation so `df` itself is not modified
df_standardized = df.copy()

# Define the height mapping in this cell (as the normalizing and clustering
# cells do) so the cell runs on its own, regardless of which other cells
# have been executed. Labels missing from the mapping become NaN.
height_mapping = {'Short': 0, 'Average': 1, 'Tall': 2}
df_standardized['Height_numeric'] = df_standardized['Height'].map(height_mapping)

# Reshape to a single-column 2-D array, the layout the scaler is given
height_data = df_standardized['Height_numeric'].values.reshape(-1, 1)

# Create a StandardScaler, fit it to the column, and transform it in one step
standard_scaler = StandardScaler()
df_standardized['Height_standardized'] = standard_scaler.fit_transform(height_data)

print("Original, numeric, and standardized 'Height' data:")
print(df_standardized[['Height', 'Height_numeric', 'Height_standardized']].head())

## Normalizing Observations

In [None]:
from sklearn.preprocessing import Normalizer

# Create a copy for this operation so `df` itself is not modified
df_normalized = df.copy()

# Define ordinal mappings for the two categorical columns.
# Labels missing from a mapping become NaN after .map().
height_mapping = {'Short': 0, 'Average': 1, 'Tall': 2}
body_size_mapping = {'Slim': 0, 'Medium': 1, 'Large': 2}

df_normalized['Height_numeric'] = df_normalized['Height'].map(height_mapping)
df_normalized['Body_Size_numeric'] = df_normalized['Body Size'].map(body_size_mapping)

# Select the numerical features as a plain (n_samples, 2) array
features = df_normalized[['Height_numeric', 'Body_Size_numeric']].values

# Create a Normalizer that rescales each row (observation) by its L2 norm.
# Normalizer is stateless, but scikit-learn only validates its parameters in
# fit(), so fit_transform() is used instead of a bare transform().
normalizer = Normalizer(norm='l2')
df_normalized[['Height_normalized', 'Body_Size_normalized']] = normalizer.fit_transform(features)

print("Original, numeric, and normalized 'Height' and 'Body Size' data:")
print(df_normalized[['Height', 'Body Size', 'Height_numeric', 'Body_Size_numeric', 'Height_normalized', 'Body_Size_normalized']].head())

## Grouping Observations Using Clustering

In [None]:
from sklearn.cluster import KMeans

# Ordinal encodings for the two categorical columns
height_mapping = {
    'Short': 0,
    'Average': 1,
    'Tall': 2,
}
body_size_mapping = {
    'Slim': 0,
    'Medium': 1,
    'Large': 2,
}

# assign() returns a new frame with the encoded columns appended, so `df`
# itself is left as it was
df_clustered = df.assign(
    Height_numeric=df['Height'].map(height_mapping),
    Body_Size_numeric=df['Body Size'].map(body_size_mapping),
)

# The two encoded columns are the clustering inputs
features_for_clustering = df_clustered.loc[:, ['Height_numeric', 'Body_Size_numeric']]

# Three clusters; the fixed random_state makes the run repeatable
kmeans = KMeans(n_init=10, random_state=0, n_clusters=3)

# Fit on the features and store each row's cluster label
cluster_labels = kmeans.fit_predict(features_for_clustering)
df_clustered['group'] = cluster_labels

print("Data with assigned cluster groups:")
print(df_clustered[['Height', 'Body Size', 'group']].head())

## Deleting Observations with Missing Values

In [None]:
import numpy as np

# Start from a copy of the data and blank out two cells to simulate gaps
df_missing = df.copy()
df_missing.loc[0, 'Body Size'] = np.nan  # row 0 loses its body size
df_missing.loc[2, 'Height'] = np.nan     # row 2 loses its height

print("DataFrame with missing values:")
print(df_missing.head())

# Keep only the rows in which every column has a value
df_dropped = df_missing.dropna(axis=0, how='any')

print("\nDataFrame after dropping rows with missing values:")
print(df_dropped.head())

## Imputing Missing Values

In [None]:
# numpy is imported here so the cell does not rely on an earlier cell for `np`
import numpy as np
from sklearn.impute import SimpleImputer

# Create a copy with missing values injected into two cells
df_to_impute = df.copy()
df_to_impute.loc[0, 'Body Size'] = np.nan
df_to_impute.loc[2, 'Height'] = np.nan

print("DataFrame with missing values:")
print(df_to_impute.head())

# Create an imputer that fills missing values with the most frequent value
# of each column
imputer = SimpleImputer(strategy='most_frequent')

# Apply the imputer to the DataFrame; the result comes back as a plain array
df_imputed_array = imputer.fit_transform(df_to_impute)

# Rebuild the DataFrame with the original column labels and index, then cast
# each column back to its original dtype (the array round-trip would
# otherwise leave every column as generic `object`)
df_imputed = pd.DataFrame(
    df_imputed_array,
    columns=df_to_impute.columns,
    index=df_to_impute.index,
).astype(df_to_impute.dtypes)

print("\nDataFrame after imputing missing values:")
print(df_imputed.head())