## ROUND 2 BABYY

### Import Stuff

In [5]:
import pandas as pd
import os
import sys

# Force UTF-8 encoding for file operations
os.environ['PYTHONIOENCODING'] = 'utf-8'

# For Jupyter notebooks, this is sufficient
import locale
locale.getpreferredencoding = lambda: "UTF-8"

print("✅ UTF-8 encoding enabled")


data_path = os.path.join('..', 'data', 'raw', 'dataset.csv')
df = pd.read_csv(data_path, encoding='utf-8')
df.head()

✅ UTF-8 encoding enabled


Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


### Code tat does sth

In [6]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
df['language']=encoder.fit_transform(df['language'])
df.head()


Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,4
1,sebes joseph pereira thomas på eng the jesuit...,17
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,19
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,18
4,de spons behoort tot het geslacht haliclona en...,2


### Lang count

In [11]:

len(df['language'].unique())


22

### Joblib

In [12]:
import joblib 
joblib.dump(encoder,'encoder_target.joblib')


['encoder_target.joblib']

### Split dataset

In [13]:


from sklearn.model_selection import train_test_split
data,test = train_test_split(df,test_size=0.15,stratify=df['language'])
train,val = train_test_split(df,test_size=0.15,stratify=df['language'])


### Code that does sth

### Install tensorflow

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional,Dropout,TextVectorization


In [15]:
MAX_WORD=5000
MAX_LEN=200
vectorizer = TextVectorization(
    max_tokens=MAX_WORD,
    output_sequence_length=MAX_LEN,
    output_mode="int"
)

vectorizer.adapt(train['Text'])

model=Sequential(
    [vectorizer,
     Embedding(input_dim=MAX_WORD,output_dim=128,input_length=MAX_LEN),
     Bidirectional(LSTM(128)),
     Dropout(0.3),
     Dense(64,activation='relu'),
     Dense(22,activation='softmax')   
    ]
)




In [16]:

x_train = train['Text'].astype(str).to_numpy()
y_train = train['language'].astype('int32').to_numpy()

x_val = val['Text'].astype(str).to_numpy()
y_val = val['language'].astype('int32').to_numpy()


In [17]:

y_val.shape


(3300,)

### Training the model

In [18]:

from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.callbacks  import EarlyStopping,ReduceLROnPlateau,ModelCheckpoint
from tensorflow.keras.metrics import Precision, Recall
model.compile(optimizer=AdamW(learning_rate=1e-3),
           loss='sparse_categorical_crossentropy'   
              ,metrics=['accuracy'])

early=EarlyStopping(monitor='val_loss',patience=5,mode='min')
reduceLr=ReduceLROnPlateau(monitor='val_loss',patience=5,min_lr=1e-5,mode='min')
history=model.fit(x_train,y_train,validation_data=(x_val,y_val),batch_size=64,epochs=20,callbacks=[reduceLr])


Epoch 1/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 348ms/step - accuracy: 0.4745 - loss: 1.6328 - val_accuracy: 0.8000 - val_loss: 0.5822 - learning_rate: 0.0010
Epoch 2/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 270ms/step - accuracy: 0.8850 - loss: 0.3506 - val_accuracy: 0.8833 - val_loss: 0.3641 - learning_rate: 0.0010
Epoch 3/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 323ms/step - accuracy: 0.9072 - loss: 0.2665 - val_accuracy: 0.7812 - val_loss: 0.6767 - learning_rate: 0.0010
Epoch 4/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 296ms/step - accuracy: 0.9101 - loss: 0.2576 - val_accuracy: 0.9221 - val_loss: 0.2206 - learning_rate: 0.0010
Epoch 5/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 296ms/step - accuracy: 0.9303 - loss: 0.1719 - val_accuracy: 0.9218 - val_loss: 0.2193 - learning_rate: 0.0010
Epoch 6/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[

In [19]:
x_test = test['Text'].astype(str).to_numpy()
y_test= test['language'].astype('int32').to_numpy()


In [20]:
metrics=model.evaluate(x_test,y_test)


[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 47ms/step - accuracy: 0.9485 - loss: 0.1274


In [21]:

print(f'Accuracy: {metrics[1]}',end='\n\n')
print(f'Loss: {metrics[0]}',end='\n\n')


Accuracy: 0.9484848380088806

Loss: 0.12737073004245758



In [22]:
model.save('model_language_identification.keras')


UnicodeEncodeError: 'charmap' codec can't encode character '\u062f' in position 29: character maps to <undefined>

In [None]:

import matplotlib.pyplot as plt

# Assuming 'history' is the variable storing the training history
metrics = ['loss','accuracy']

plt.figure(figsize=(15, 10))

for i, metric in enumerate(metrics):
    plt.subplot(2, 3, i + 1)  # 2 rows, 3 columns of plots
    plt.plot(history.history[metric], label='Train')
    plt.plot(history.history['val_' + metric], label='Validation')
    plt.title(metric.capitalize())
    plt.ylabel(metric.capitalize())
    plt.xlabel('Epoch')
    plt.legend(loc='best')

plt.tight_layout()
plt.show()


### Lang Prediction

In [None]:
from tensorflow.keras.models import load_model
from joblib import load
model=load_model('/kaggle/input/model-and-encoder/tensorflow2/default/1/model_language_identification.keras')
encoder=load('/kaggle/input/model-and-encoder/tensorflow2/default/1/encoder_target.joblib')


In [None]:

from tensorflow import constant
import numpy as np
texts=constant(['''Chaque matin, je me lève tôt pour me promener dans le parc près de chez moi. J'aime écouter le chant des oiseaux et sentir la fraîcheur de l'air. C'est un moment de calme qui me permet de commencer la journée en paix. Ensuite, je prends un petit déjeuner léger avant de commencer mon travail. Ce petit rituel m'aide à rester concentré et de bonne humeur toute la journée.''',
                              '''كل صباح، أستيقظ مبكرًا لأتمشى في الحديقة القريبة من منزلي. أحب الاستماع إلى زقزقة العصافير واستنشاق نسمات الهواء العليل. إنها لحظة هادئة تساعدني على بدء يومي بسلام. بعد ذلك، أتناول فطورًا خفيفًا قبل أن أبدأ عملي. هذا الروتين الصغير يساعدني على البقاء مركزًا وبمزاج جيد طوال اليوم.'''])
y_pred=model.predict(texts)
predicted_indices = np.argmax(y_pred, axis=1)
labels = encoder.inverse_transform(predicted_indices)

for text, label in zip(texts, labels):
    print("Language prediction:", label)
