In [179]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.feature_extraction.text import CountVectorizer
import speech_recognition as sr
from enum import Enum

In [180]:
data = pd.read_csv("Book.csv", encoding='iso-8859-1')
print(data.head())
print(data.isnull().sum())
print(data['Language'].value_counts())

                                                Text Language
0   an artificial market using personal vouchers ...  English
1  Ek krutrim bajar jaah? vyaktigata bh???ra upay...     Odia
2  Ek krtrimik bazar jo vyakti gat vauchers ka up...    Hindi
3  Oka krutrima maarketu, idi vyaktigata vocharlu...   Telugu
4  Oru seyyarkai sandhai, idhu thanippatta vaucha...    Tamil
Text        0
Language    0
dtype: int64
Language
English    30
Odia       30
Hindi      30
Telugu     30
Tamil      30
Bengoli    30
Name: count, dtype: int64


In [181]:
# Prepare the data
x = np.array(data['Text'])
y = np.array(data['Language'])

In [182]:
# Encode the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [183]:
# Vectorize the text data
cv = CountVectorizer(max_features=5000)  # Limit features to 5000 most frequent words
X = cv.fit_transform(x).toarray()

In [184]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.33, random_state=42)

In [185]:
# CNN Model (without Embedding layer)
def create_cnn_model(input_dim):
    model = models.Sequential()
    
    # Add Convolutional Layer
    model.add(layers.Conv1D(filters=128, kernel_size=5, activation='relu', input_shape=(input_dim, 1)))  # Input shape added here
    model.add(layers.MaxPooling1D(pool_size=2))
    
    # Add another Convolutional Layer
    model.add(layers.Conv1D(filters=64, kernel_size=5, activation='relu'))
    model.add(layers.MaxPooling1D(pool_size=2))
    
    # Flatten the output of the convolutional layers
    model.add(layers.Flatten())
    
    # Fully Connected Layer
    model.add(layers.Dense(64, activation='relu'))
    
    # Output Layer
    model.add(layers.Dense(len(label_encoder.classes_), activation='softmax'))  # Softmax for multi-class classification
    
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return model

In [186]:
input_dim = X_train.shape[1]

In [187]:
# Reshape data for Conv1D (need a 3D shape: [samples, timesteps, features])
X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_reshaped = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

In [188]:
# Create and initialize the model
cnn_model = create_cnn_model(input_dim)
cnn_model.summary()  # Print model architecture

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [189]:
# Train the model
cnn_model.fit(X_train_reshaped, y_train, epochs=3, batch_size=32, validation_data=(X_test_reshaped, y_test))

Epoch 1/3
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 452ms/step - accuracy: 0.3600 - loss: 1.6588 - val_accuracy: 0.8667 - val_loss: 0.8682
Epoch 2/3
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 298ms/step - accuracy: 0.9765 - loss: 0.4548 - val_accuracy: 0.8667 - val_loss: 0.3562
Epoch 3/3
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 310ms/step - accuracy: 0.9860 - loss: 0.0556 - val_accuracy: 0.9167 - val_loss: 0.1784


<keras.src.callbacks.history.History at 0x263140d5730>

In [190]:
# Evaluate the model
accuracy = cnn_model.evaluate(X_test_reshaped, y_test)
print(f"Model Accuracy: {accuracy[1]:.2f}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step - accuracy: 0.9132 - loss: 0.1931
Model Accuracy: 0.92


In [191]:
class Language(Enum):
    ENGLISH = 'en-US'
    FRENCH = 'fr-FR'
    GERMAN = 'de-DE'
    ITALIAN = 'it-IT'
    SPANISH = 'es-ES'
    PORTUGUESE = 'pt-BR'
    KOREAN = "ko-KR"
    CHINESE_SIMPLIFIED = "zh-CN"
    CHINESE_TRADITIONAL = "zh-TW"
    JAPANESE = "ja-JP"
    RUSSIAN = "ru-RU"
    POLISH = "pl-PL"
    UKRAINIAN = "uk-UA"
    BULGARIAN = "bg-BG"
    BENGALI = "bn-BD"
    TURKISH = "tr-TR"
    ARABIC = "ar-SA"
    INDONESIAN = "id-ID"
    THAI = "th-TH"
    VIETNAMESE = "vi-VN"
    MALAY = "ms-MY"
    HINDI = "hi-IN"
    PUNJABI = "pa-IN"
    TELUGU = "te-IN"
    GUJARATI = "gu-IN"
    ORIYA = "or-IN"
    MARATHI = "mr-IN"
    SINDHI = "sd-IN"
    TAMIL = "ta-IN"
    KANNADA = "kn-IN"
    MALAYALAM = "ml-IN"
    ASSAMESE = "as-IN"
    ODIA = "or-IN"
    SANSKRIT = "sa-IN"


In [192]:
# Speech to Text Class
class SpeechToText:
    @staticmethod
    def speech_to_text(device_index=1, language=Language.ENGLISH):
        r = sr.Recognizer()
        with sr.Microphone(device_index=device_index) as source:
            print("Recording...")
            audio = r.listen(source)
            print("Recording Complete...")

            try:
                # Transcribe audio to text
                text = r.recognize_google(audio, language=language.value)
                return text
            except sr.UnknownValueError:
                print("Could not understand audio")
                return None
            except sr.RequestError as e:
                print("Request error from Google Speech Recognition service; {0}".format(e))
                return None

In [193]:
# Predict the language from audio
def predict_language_from_audio(device_index=1, language=Language.ENGLISH):
    transcribed_text = SpeechToText.speech_to_text(device_index, language)
    
    if transcribed_text:
        # Transform the transcribed text to feature vector
        data = cv.transform([transcribed_text]).toarray()
        
        # Reshape the data for the model (since CNN expects 3D input)
        data_reshaped = data.reshape((data.shape[0], data.shape[1], 1))
        
        # Get the model's prediction
        output = cnn_model.predict(data_reshaped)
        
        # Get the label with the highest probability
        predicted_label = np.argmax(output)
        
        # Convert to original language name using the label encoder
        predicted_language = label_encoder.inverse_transform([predicted_label])
        
        print(f"Predicted Language: {predicted_language[0]}")
    else:
        print("No transcribed text received.")

In [194]:
# Example usage
if __name__ == "__main__":
    device_index = 1  # Set device index based on available microphones
    language = Language.ENGLISH  # Set language to English for recognition
    predict_language_from_audio(device_index=device_index, language=language)

Recording...
Recording Complete...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 158ms/step
Predicted Language: Tamil
