In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import speech_recognition as sr
from enum import Enum

In [2]:
class Language(Enum):
    """Locale tags (``language-REGION``) selectable for speech recognition.

    Each member's ``value`` is the string handed to the recognizer as its
    ``language`` argument; the member ``name`` is only used for display.
    """

    ENGLISH = "en-US"
    FRENCH = "fr-FR"
    GERMAN = "de-DE"
    ITALIAN = "it-IT"
    SPANISH = "es-ES"
    PORTUGUESE = "pt-BR"
    KOREAN = "ko-KR"
    CHINESE_SIMPLIFIED = "zh-CN"
    CHINESE_TRADITIONAL = "zh-TW"
    JAPANESE = "ja-JP"
    RUSSIAN = "ru-RU"
    POLISH = "pl-PL"
    UKRAINIAN = "uk-UA"
    BULGARIAN = "bg-BG"
    BENGALI = "bn-BD"
    TURKISH = "tr-TR"
    ARABIC = "ar-SA"
    INDONESIAN = "id-ID"
    THAI = "th-TH"
    VIETNAMESE = "vi-VN"
    MALAY = "ms-MY"
    HINDI = "hi-IN"
    PUNJABI = "pa-IN"
    TELUGU = "te-IN"
    GUJARATI = "gu-IN"
    ORIYA = "or-IN"
    MARATHI = "mr-IN"
    SINDHI = "sd-IN"
    TAMIL = "ta-IN"
    KANNADA = "kn-IN"
    MALAYALAM = "ml-IN"
    ASSAMESE = "as-IN"
    # Same value as ORIYA, so Enum treats ODIA as an alias: `Language.ODIA is
    # Language.ORIYA`, its `.name` reports "ORIYA", and it is skipped when
    # iterating over the enum.
    ODIA = "or-IN"
    SANSKRIT = "sa-IN"

In [3]:
# Load the labelled corpus; point the path below at your own CSV if it lives
# elsewhere. Later cells read the 'Text' and 'Language' columns from it.
data = pd.read_csv(filepath_or_buffer="Language Detection.csv")
# If the file is not UTF-8 encoded, pass an explicit encoding to read_csv,
# e.g. encoding='ISO-8859-1'.

In [4]:
# Quick sanity report: first rows, per-column null counts, number of samples
# per language label, and column dtypes — one item after another.
print(
    data.head(),
    data.isnull().sum(),
    data['Language'].value_counts(),
    data.dtypes,
    sep="\n",
)

                                                Text Language
0   Nature, in the broadest sense, is the natural...  English
1  "Nature" can refer to the phenomena of the phy...  English
2  The study of nature is a large, if not the onl...  English
3  Although humans are part of nature, human acti...  English
4  [1] The word nature is borrowed from the Old F...  English
Text        0
Language    0
dtype: int64
Language
English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: count, dtype: int64
Text        object
Language    object
dtype: object


In [5]:
# x holds the raw text samples, y the language label for each sample.
x, y = np.array(data["Text"]), np.array(data["Language"])

In [6]:
# Bag-of-words counts over the whole corpus. Dropping stop words is optional.
# NOTE(review): 'english' selects an English-only stop list, so only English
# function words are removed from this multilingual corpus — confirm that is
# intended.
# NOTE(review): the vocabulary is learned here, before the train/test split in
# the next cell, so it also contains words that occur only in test samples.
cv = CountVectorizer(stop_words="english")
X = cv.fit_transform(x)

In [7]:
# Hold out 33% of the samples for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)


In [8]:
# Define SpeechToText class
class SpeechToText:
    """Records from a microphone and transcribes it with Google Speech Recognition."""

    @staticmethod
    def speech_to_text(device_index=1, language=Language.ENGLISH):
        """Record one utterance and return its transcription.

        Args:
            device_index: Index of the microphone to open, passed straight to
                ``sr.Microphone``.
            language: A ``Language`` member, or a plain locale string such as
                ``"nl-NL"`` for languages the enum does not list.

        Returns:
            The transcribed text, or ``None`` when the audio could not be
            understood or the recognition request failed (a message is printed
            in both cases).
        """
        # Accept both enum members and raw locale strings.
        language_code = getattr(language, "value", language)
        language_label = getattr(language, "name", language)

        r = sr.Recognizer()

        # Use the appropriate microphone device based on device_index
        with sr.Microphone(device_index=device_index) as source:
            print("Recording...")
            audio = r.listen(source)
            print("Recording Complete...")

        # The microphone is closed at this point: the network round-trip below
        # only needs the captured audio, so the device is not held open while
        # waiting for the service.
        try:
            # Transcribe audio to text using Google Speech Recognition
            text = r.recognize_google(audio, language=language_code)
        except sr.UnknownValueError:
            print("Could not understand audio")
            return None
        except sr.RequestError as e:
            print(f"Request error from Google Speech Recognition service; {e}")
            return None

        print(f"Transcribed Text ({language_label}):", text)
        return text

In [9]:
# Support-vector classifier with a linear kernel, trained on the training
# portion of the bag-of-words matrix.
model = SVC(kernel="linear")
model.fit(X_train, y_train)

In [10]:
# Mean accuracy of the classifier on the held-out samples.
accuracy = model.score(X=X_test, y=y_test)
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.91


In [11]:
def predict_language_from_audio(device_index=1, language=Language.ENGLISH):
    """Record speech, transcribe it, and classify the language of the text.

    Args:
        device_index: Microphone index forwarded to
            ``SpeechToText.speech_to_text``.
        language: Recognition locale forwarded to
            ``SpeechToText.speech_to_text``.

    Returns:
        The predicted language label (also printed), or ``None`` when no
        transcription was obtained.
    """
    transcribed_text = SpeechToText.speech_to_text(device_index, language)

    # None or an empty transcription: nothing to classify.
    if not transcribed_text:
        return None

    # Vectorize with the already-fitted vectorizer. The row is kept sparse,
    # matching the sparse matrix the model was trained on, and the local name
    # avoids shadowing the notebook-level `data` DataFrame.
    features = cv.transform([transcribed_text])
    predicted_language = model.predict(features)[0]
    print("Predicted Language:", predicted_language)
    return predicted_language

In [12]:
if __name__ == "__main__":
    # device_index: which of the available microphones to record from.
    # language: locale the speech recognizer should assume (English here).
    device_index, language = 1, Language.ENGLISH
    predict_language_from_audio(device_index=device_index, language=language)

Recording...
Recording Complete...
Transcribed Text (ENGLISH): por
Predicted Language: Spanish
