# Handling Categorical Data

## Encoding Nominal Categorical Features

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelBinarizer

# Read the dataset once; the later sections take copies of this frame
df = pd.read_csv('Prakriti_With_Features.csv')

# Double brackets keep 'Complexion' as a one-column frame, so `.values`
# is a 2-D array with one row per sample
feature = df[['Complexion']].values

# LabelBinarizer serves as the one-hot encoder for this single column
one_hot_encoder = LabelBinarizer()

# Learn the distinct categories, then encode the same data with them
one_hot_encoder.fit(feature)
one_hot_encoded = one_hot_encoder.transform(feature)

print("One-hot encoded 'Complexion' feature (first 5 rows):")
encoded_preview = one_hot_encoded[:5]
print(encoded_preview)

# The fitted encoder exposes the categories it learned
print("\nEncoded classes:")
print(one_hot_encoder.classes_)

# Decoding the indicator matrix recovers the original category labels
print("\nReversed one-hot encoding (first 5 rows):")
decoded = one_hot_encoder.inverse_transform(one_hot_encoded)
print(decoded[:5])

### get_dummies

In [None]:
# Work on a copy so the shared `df` is left untouched for the other sections
df_dummies = df.copy()

# One indicator column per distinct 'Complexion' value, each named
# 'Complexion_<value>' because of the prefix
complexion_dummies = pd.get_dummies(df_dummies['Complexion'], prefix='Complexion')

# Append the indicator columns next to the original columns
df_dummies = pd.concat([df_dummies, complexion_dummies], axis=1)

# Build the preview from the columns get_dummies actually produced instead of
# hard-coding the category spellings: a renamed, missing, or additional
# category in the CSV would otherwise raise KeyError or be silently hidden.
preview_columns = ['Complexion', *complexion_dummies.columns]

print("DataFrame with one-hot encoded 'Complexion' using get_dummies (first 5 rows):")
print(df_dummies[preview_columns].head())

## Encoding Ordinal Categorical Features

In [None]:
# Work on a copy so the shared `df` is left untouched for the other sections
df_ordinal = df.copy()

# Ordered categories -> integer ranks (a larger number means more stress)
stress_mapper = {'low': 1, 'moderate': 2, 'high': 3}

# Use Series.map rather than Series.replace for the encoding:
#   * replace() leaves any value that is not a key of the mapper untouched,
#     which yields a column mixing integers and strings;
#   * replace() relies on an implicit object -> int downcast that pandas 2.2
#     deprecates (FutureWarning).
# map() always produces a numeric column; unmapped values become NaN.
df_ordinal['Stress Levels_encoded'] = df_ordinal['Stress Levels'].map(stress_mapper)

# Report categories the mapper does not cover (e.g. different capitalisation)
# so they are noticed instead of silently turning into NaN.
unmapped_mask = (
    df_ordinal['Stress Levels_encoded'].isna()
    & df_ordinal['Stress Levels'].notna()
)
if unmapped_mask.any():
    unmapped_values = sorted(df_ordinal.loc[unmapped_mask, 'Stress Levels'].astype(str).unique())
    print(f"Warning: 'Stress Levels' values without a mapping (encoded as NaN): {unmapped_values}")

print("Original and encoded 'Stress Levels' data (showing a mix of values):")
print(df_ordinal[['Stress Levels', 'Stress Levels_encoded']].sample(5, random_state=42))

## Encoding Dictionaries of Features

In [None]:
from sklearn.feature_extraction import DictVectorizer

# Toy observations: one dict per row, mapping a feature name to its value.
# Not every row carries every key.
data_dict = [
    {'Red': 2, 'Blue': 4},
    {'Red': 4, 'Blue': 3},
    {'Red': 1, 'Yellow': 2},
    {'Red': 2, 'Yellow': 2},
]

# Vectorizer configured with sparse=False
dictvectorizer = DictVectorizer(sparse=False)

# Learn the feature names and build the feature matrix in one step
features = dictvectorizer.fit_transform(data_dict)

print("Feature matrix from dictionary:")
print(features)

# Column labels of the matrix, as reported by the fitted vectorizer
print("\nFeature names:")
feature_names = dictvectorizer.get_feature_names_out()
print(feature_names)

## Imputing Missing Class Values

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Reload the dataset so this section starts from the raw file
df = pd.read_csv('Prakriti_With_Features.csv')

# On a copy, blank out 'Hair Color' in rows 1 and 5 to simulate missing values
df_impute_cat = df.copy()
for row_label in (1, 5):
    df_impute_cat.loc[row_label, 'Hair Color'] = np.nan

# Imputer that fills gaps with the most frequent category of the column
imputer_cat = SimpleImputer(strategy='most_frequent')

# Pass the column as a one-column frame (2-D), then flatten the imputer's
# output back to 1-D so it can be stored as a regular column
hair_color_frame = df_impute_cat[['Hair Color']]
hair_color_filled = imputer_cat.fit_transform(hair_color_frame)
df_impute_cat['Hair Color_imputed'] = hair_color_filled.ravel()

print("DataFrame after imputing 'Hair Color' (rows 0-6):")
comparison_columns = ['Hair Color', 'Hair Color_imputed']
print(df_impute_cat[comparison_columns].head(7))

## Handling Imbalanced Classes

In [None]:
# Count the rows in each 'Dosha' class to see how uneven the classes are
print("Distribution of classes in the 'Dosha' column:")
dosha_counts = df['Dosha'].value_counts()
print(dosha_counts)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Option 1: class_weight='balanced' has the classifier derive the weights
# itself, inversely proportional to how often each class occurs
balanced_rf = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
)
print("Random Forest Classifier with balanced class weights:")
print(balanced_rf)

# Option 2: give an explicit weight per class label.
# Here 'vata+pitta' gets the smallest weight and the single-dosha
# classes get the largest.
manual_weights = {
    'Vata': 3.0,
    'Pitta': 3.0,
    'Kapha': 3.0,
    'vata+pitta': 0.5,
    'pitta+kapha': 2.0,
    'vata+kapha': 2.0,
}
manual_rf = RandomForestClassifier(
    class_weight=manual_weights,
    random_state=42,
)
print("\nRandom Forest Classifier with manual class weights:")
print(manual_rf)

import pandas as pd
from sklearn.utils import resample

# Reload the dataset so this cell also works when run on its own
df = pd.read_csv('Prakriti_With_Features.csv')

# Split the rows into the 'vata+pitta' class (treated as the majority)
# and every other class
is_majority = df['Dosha'] == 'vata+pitta'
df_majority = df[is_majority]
df_minority = df[~is_majority]

# Target size: the number of rows labelled 'Vata'
vata_row_count = int((df['Dosha'] == 'Vata').sum())

# Draw that many majority rows without replacement; the fixed seed makes
# the draw repeatable
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=vata_row_count,
    random_state=42,
)

# Stack the reduced majority rows on top of the untouched remaining classes
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

print("Value counts after downsampling 'vata+pitta':")
print(df_downsampled['Dosha'].value_counts())