1. Data Exploration and Preprocessing

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.utils import to_categorical

# Load the dataset
df = pd.read_csv("Alphabets_data.csv")

# Display basic information
print("Dataset Shape:", df.shape)
print("\nDataset Info:")
df.info()
print("\nFirst 5 rows of the dataset:")
print(df.head())
print("\nUnique classes (alphabets):", df['letter'].nunique())
print("Class distribution:\n", df['letter'].value_counts())

# Check for missing values
print("\nMissing values per column:")
print(df.isnull().sum())

# Separate features (X) and target (y)
X = df.drop('letter', axis=1)
y = df['letter']

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded) # One-hot encode for Keras

# Normalize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("\nShape of X_scaled:", X_scaled.shape)
print("Shape of y_categorical:", y_categorical.shape)

Dataset Shape: (20000, 17)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   letter  20000 non-null  object
 1   xbox    20000 non-null  int64 
 2   ybox    20000 non-null  int64 
 3   width   20000 non-null  int64 
 4   height  20000 non-null  int64 
 5   onpix   20000 non-null  int64 
 6   xbar    20000 non-null  int64 
 7   ybar    20000 non-null  int64 
 8   x2bar   20000 non-null  int64 
 9   y2bar   20000 non-null  int64 
 10  xybar   20000 non-null  int64 
 11  x2ybar  20000 non-null  int64 
 12  xy2bar  20000 non-null  int64 
 13  xedge   20000 non-null  int64 
 14  xedgey  20000 non-null  int64 
 15  yedge   20000 non-null  int64 
 16  yedgex  20000 non-null  int64 
dtypes: int64(16), object(1)
memory usage: 2.6+ MB

First 5 rows of the dataset:
  letter  xbox  ybox  width  height  onpix  xbar  ybar  x2bar  y2bar  xybar  \
0    

2. Model Implementation

In [12]:
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split

le=LabelEncoder()
df['letter']=le.fit_transform(df['letter'])
features=df.drop('letter',axis=1)
se=StandardScaler()
se.fit_transform(features)

target=df['letter']

x_train,x_test,y_train,y_test=train_test_split(features,target,test_size=0.2,random_state=42)
from sklearn.preprocessing import StandardScaler



In [13]:
from keras.models import Sequential
from keras.layers import Dense
model = Sequential([
    Dense(64, activation='relu', input_shape=(x_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1,activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
score=model.evaluate(x_train,y_train)
score

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.0449 - loss: 3.7961


[3.7961349487304688, 0.044874999672174454]

3. Hyperparameter Tuning

In [20]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from kerastuner import RandomSearch

def build_model(hp):
    model = Sequential()
    model.add(Dense(units=hp.Int('units1', min_value=32, max_value=128, step=16),
                    activation='relu',
                    input_shape=(x_train.shape[1],)))
    model.add(Dense(units=hp.Int('units2', min_value=16, max_value=64, step=16),
                    activation='relu'))
    model.add(Dense(26, activation='softmax'))  # 26 classes for A-Z
    
    model.compile(
        optimizer=tf.keras.optimizers.Adam(
            learning_rate=hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
        ),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=5,
    executions_per_trial=3,
    directory='my_dir',
    project_name='alphabet_tuning'
)

tuner.search(x_train, y_train, epochs=50, validation_split=0.2)

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"Best units1: {best_hps.get('units1')}, Best units2: {best_hps.get('units2')}, Best learning_rate: {best_hps.get('learning_rate')}")

best_model = tuner.hypermodel.build(best_hps)
best_model.fit(x_train, y_train, epochs=50, validation_split=0.2, batch_size=32)


Reloading Tuner from my_dir\alphabet_tuning\tuner0.json
Best units1: 48, Best units2: 16, Best learning_rate: 0.01
Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.3336 - loss: 2.1453 - val_accuracy: 0.5425 - val_loss: 1.4981
Epoch 2/50
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5730 - loss: 1.3485 - val_accuracy: 0.6597 - val_loss: 1.1414
Epoch 3/50
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.6701 - loss: 1.0664 - val_accuracy: 0.6609 - val_loss: 1.0732
Epoch 4/50
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6972 - loss: 0.9635 - val_accuracy: 0.7241 - val_loss: 0.8995
Epoch 5/50
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.7215 - loss: 0.8860 - val_accuracy: 0.7281 - val_loss: 0.9034
Epoch 6/50
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7270 - loss: 0.8717 - val_accuracy: 0.7284 - val_loss: 0.8935
Epoch 7/50
[1m400/400[0m [32m━━━━━━━

<keras.src.callbacks.history.History at 0x223de59d450>

4. Evaluation

In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import matplotlib.pyplot as plt

# --- Default model evaluation ---
y_pred_probs_default = model.predict(x_test)  # default_model: your original model
y_pred_default = np.argmax(y_pred_probs_default, axis=1)

accuracy_default = accuracy_score(y_test, y_pred_default)
precision_default = precision_score(y_test, y_pred_default, average='weighted')
recall_default = recall_score(y_test, y_pred_default, average='weighted')
f1_default = f1_score(y_test, y_pred_default, average='weighted')

# --- Tuned model evaluation ---
y_pred_probs_tuned = best_model.predict(x_test)  # best_model: tuned model
y_pred_tuned = np.argmax(y_pred_probs_tuned, axis=1)

accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
precision_tuned = precision_score(y_test, y_pred_tuned, average='weighted')
recall_tuned = recall_score(y_test, y_pred_tuned, average='weighted')
f1_tuned = f1_score(y_test, y_pred_tuned, average='weighted')

# --- Print results ---
print(f"\nDefault Model Performance:")
print(f"Accuracy: {accuracy_default:.4f}, Precision: {precision_default:.4f}, Recall: {recall_default:.4f}, F1-Score: {f1_default:.4f}")

print(f"\nTuned Model Performance:")
print(f"Accuracy: {accuracy_tuned:.4f}, Precision: {precision_tuned:.4f}, Recall: {recall_tuned:.4f}, F1-Score: {f1_tuned:.4f}")

print("\n--- Comparison ---")
print(f"Default Model Accuracy: {accuracy_default:.4f}")
print(f"Tuned Model Accuracy:   {accuracy_tuned:.4f}")

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
[1m  1/125[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m10s[0m 87ms/step

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Default Model Performance:
Accuracy: 0.0372, Precision: 0.0014, Recall: 0.0372, F1-Score: 0.0027

Tuned Model Performance:
Accuracy: 0.8040, Precision: 0.8188, Recall: 0.8040, F1-Score: 0.8044

--- Comparison ---
Default Model Accuracy: 0.0372
Tuned Model Accuracy:   0.8040
