In [1]:
import numpy as np
import pandas as pd

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/lingualsense/merged_dataset.csv
/kaggle/input/lingualsensesingle/tensorflow2/default/1/label_encoder.pkl
/kaggle/input/lingualsensesingle/tensorflow2/default/1/language_detection_gru.h5
/kaggle/input/lingualsensesingle/tensorflow2/default/1/tfidf_vectorizer.pkl


In [2]:
df = pd.read_csv('/kaggle/input/lingualsense/merged_dataset.csv')

In [3]:
df.rename(columns={'Text': 'text', 'Language': 'language'}, inplace=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32337 entries, 0 to 32336
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      32337 non-null  object
 1   language  32337 non-null  object
dtypes: object(2)
memory usage: 505.4+ KB


In [5]:
df.describe()

Unnamed: 0,text,language
count,32337,32337
unique,32126,30
top,haec commentatio automatice praeparata res ast...,English
freq,48,2385


In [6]:
df.head()

Unnamed: 0,text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


In [7]:
df

Unnamed: 0,text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch
...,...,...
32332,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...,Kannada
32333,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...,Kannada
32334,ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...,Kannada
32335,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...,Kannada


In [8]:
df.isna().sum()

text        0
language    0
dtype: int64

In [9]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['language_encoded'] = label_encoder.fit_transform(df['language'])

print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

{'Arabic': 0, 'Chinese': 1, 'Danish': 2, 'Dutch': 3, 'English': 4, 'Estonian': 5, 'French': 6, 'German': 7, 'Greek': 8, 'Hindi': 9, 'Indonesian': 10, 'Italian': 11, 'Japanese': 12, 'Kannada': 13, 'Korean': 14, 'Latin': 15, 'Malayalam': 16, 'Persian': 17, 'Portugeese': 18, 'Portugese': 19, 'Pushto': 20, 'Romanian': 21, 'Russian': 22, 'Spanish': 23, 'Swedish': 24, 'Sweedish': 25, 'Tamil': 26, 'Thai': 27, 'Turkish': 28, 'Urdu': 29}


In [10]:
import re

def split_text(text):
    return re.split(r'[.!?]', text)  # Split by punctuation

In [11]:
sample_text = "Hello world! வணக்கம் உலகம்! Bonjour le monde."
chunks = split_text(sample_text)
print(chunks)

['Hello world', ' வணக்கம் உலகம்', ' Bonjour le monde', '']


In [12]:
import pickle
from tensorflow.keras.models import load_model

# Load GRU model
model = load_model('/kaggle/input/lingualsensesingle/tensorflow2/default/1/language_detection_gru.h5')

# Load TF-IDF vectorizer
with open('/kaggle/input/lingualsensesingle/tensorflow2/default/1/tfidf_vectorizer.pkl', 'rb') as f:
    tfidf = pickle.load(f)

# Load LabelEncoder
with open('/kaggle/input/lingualsensesingle/tensorflow2/default/1/label_encoder.pkl', 'rb') as f:
    label_encoder = pickle.load(f)


In [13]:
def detect_language_gru(text):

    chunks = split_text(text)
    detected_languages = []

    for chunk in chunks:
        if chunk.strip():  # Ignore empty chunks
            # Transform chunk into TF-IDF features
            input_tfidf = tfidf.transform([chunk]).toarray()

            # Handle out-of-vocabulary chunks
            if input_tfidf.sum() == 0:
                detected_languages.append("Unknown (OOV)")
            else:
                # Predict language
                prediction = model.predict(input_tfidf)
                predicted_label = label_encoder.inverse_transform([prediction.argmax()])[0]
                detected_languages.append(predicted_label)

    return detected_languages

# Test with sample text
result = detect_language_gru("sebes joseph pereira thomas வணக்கம் உலகம்! Bonjour le monde.")
print(result)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
['Swedish', 'French']
