<h1 style="color:cyan; text-align:center; font-size:250%; font-weight:bold;">
Handling Numerical Data: Encoding</h1>

<h2 style="color: lavender; text-align:left; font-size:130%; font-weight:bold;">
Table of Contents:
</h2>
        <ol style="font-size:120%;">
            <li>Binning</li>
            <li>Binarization</li>
        </ol>

<h2 style="color:gold; text-align:left; font-size:200%; font-weight:bold;">
1. Binning</h2>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.tree import DecisionTreeClassifier

# Load the Titanic dataset, keep only the columns used in this section and
# drop rows with missing values. Doing this as one chained expression builds a
# fresh frame instead of calling dropna(inplace=True) on a column subset, which
# can raise SettingWithCopyWarning and makes the cell non-idempotent on re-run.
df = sns.load_dataset('titanic')[['age', 'fare', 'survived']].dropna()

# Split the data into features and target
X = df[['age', 'fare']]  # Features (age and fare)
y = df['survived']       # Target (survived)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function for discretization
def _plot_before_after(before, after, label):
    """Draw side-by-side histograms of one feature before and after binning."""
    fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(6, 2))
    ax_before.hist(before, bins=10, color='blue', alpha=0.7)
    ax_before.set_title(f"{label} Before")
    ax_after.hist(after, bins=10, color='red', alpha=0.7)
    ax_after.set_title(f"{label} After")
    plt.show()


def discretize(bins, strategy, random_state=42):
    """Bin 'age' and 'fare', cross-validate a decision tree, and plot the result.

    Reads ``X_train`` and ``y_train`` from the notebook's global scope.

    Parameters
    ----------
    bins : int
        Number of bins passed to each ``KBinsDiscretizer`` as ``n_bins``.
    strategy : str
        Binning strategy passed to each ``KBinsDiscretizer`` as ``strategy``.
    random_state : int, default=42
        Seed for the decision tree so the printed score is reproducible.

    Returns
    -------
    None
        The mean 10-fold accuracy is printed and two figures are shown.
    """
    # Columns are addressed by position: 0 is 'age', 1 is 'fare' in X_train.
    trf = ColumnTransformer([
        ('age', KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy=strategy), [0]),
        ('fare', KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy=strategy), [1]),
    ])

    # Cross-validate the discretizer and the tree together as one pipeline so
    # the bin edges are re-learned on each training fold. Fitting the
    # discretizer on all of X_train first would leak information from the
    # validation folds into the preprocessing step.
    model = make_pipeline(trf, DecisionTreeClassifier(random_state=random_state))
    cv_score = np.mean(cross_val_score(model, X_train, y_train, cv=10, scoring='accuracy'))
    print(f"Cross-Validation Accuracy: {cv_score}")

    # cross_val_score works on clones, so `trf` is still unfitted here; fit it
    # on the full training set purely to visualise the binned features.
    X_trf = trf.fit_transform(X_train)
    _plot_before_after(X_train['age'], X_trf[:, 0], "Age")
    _plot_before_after(X_train['fare'], X_trf[:, 1], "Fare")

# Demo run: five bins per feature using the 'kmeans' strategy
discretize(bins=5, strategy='kmeans')

<h2 style="color:gold; text-align:left; font-size:200%; font-weight:bold;">
2. Binarization</h2>

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer

# Load and prepare the dataset in one chained expression. Each step returns a
# new frame, so nothing is mutated in place; this avoids the
# SettingWithCopyWarning that inplace operations and column assignment on a
# column subset can raise, and keeps the cell idempotent on re-run.
df = (
    sns.load_dataset('titanic')[['age', 'fare', 'sibsp', 'parch', 'survived']]
    .dropna()
    # Create a new feature 'family' by combining 'sibsp' and 'parch'
    .assign(family=lambda frame: frame['sibsp'] + frame['parch'])
    .drop(columns=['sibsp', 'parch'])
)

# Split the data into features (X) and target (y)
X = df.drop(columns=['survived'])
y = df['survived']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


from sklearn.preprocessing import Binarizer

# Apply binarization to the 'family' feature; the remaining columns pass
# through unchanged. Binarizer is left at its default copy behaviour so it
# never writes into the data it is given.
trf = ColumnTransformer([
    ('bin', Binarizer(), ['family'])
], remainder='passthrough')

X_train_trf = trf.fit_transform(X_train)
X_test_trf = trf.transform(X_test)

# Display the transformed training data. The transformed column comes first,
# followed by the passthrough columns. display() is needed because a bare
# expression in the middle of a cell is evaluated and silently discarded.
display(pd.DataFrame(X_train_trf, columns=['family', 'age', 'fare']).head())

# Train a Decision Tree classifier on the binarized data
# (seeded so the reported accuracy is reproducible across runs)
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_trf, y_train)

# Predict and evaluate the model
y_pred2 = clf.predict(X_test_trf)
print("Accuracy with binarization:", accuracy_score(y_test, y_pred2))

# Cross-validation with binarized data
X_trf = trf.fit_transform(X)
cv_scores = cross_val_score(DecisionTreeClassifier(random_state=42),
                            X_trf, y, cv=10, scoring='accuracy')
print("Cross-validation score with binarization:", np.mean(cv_scores))