In [1]:
import os
import time
import pandas as pd
import regex as re
import numpy as np
import kagglehub
import multiprocessing
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
path = kagglehub.dataset_download("basilb2s/language-detection")
print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/basilb2s/language-detection/versions/1


In [3]:
df = pd.read_csv(os.path.join(path, "Language Detection.csv"))
print(f"{df.head()} \n")
print(f"{df.columns} \n")
print(f"{df['Language'].unique()} \n")
print(f"Number of rows - {(df.shape[0])} \n")

                                                Text Language
0   Nature, in the broadest sense, is the natural...  English
1  "Nature" can refer to the phenomena of the phy...  English
2  The study of nature is a large, if not the onl...  English
3  Although humans are part of nature, human acti...  English
4  [1] The word nature is borrowed from the Old F...  English 

Index(['Text', 'Language'], dtype='object') 

['English' 'Malayalam' 'Hindi' 'Tamil' 'Portugeese' 'French' 'Dutch'
 'Spanish' 'Greek' 'Russian' 'Danish' 'Italian' 'Turkish' 'Sweedish'
 'Arabic' 'German' 'Kannada'] 

Number of rows - 10337 



In [4]:
df_filtered = df[df["Text"].str.split().str.len() < 4]
print(df_filtered)

                                    Text Language
347                             Kennedy.  English
750                                GNE).  English
1074                        wasn't able.  English
1100                        how are you?  English
1102                    how's it going?.  English
...                                  ...      ...
10275  ನಾನು ನಿಮ್ಮೊಂದಿಗೆ ಸರಿಯಾಗಿರುತ್ತೇನೆ.  Kannada
10276                 ಕ್ಷಮೆಯಾಚಿಸುತ್ತಿದೆ.  Kannada
10279              ಅದರ ಬಗ್ಗೆ ಚಿಂತಿಸಬೇಡಿ.  Kannada
10280                        ಚಿಂತಿಸಬೇಡಿ.  Kannada
10320                      ನೀನು ತಿನ್ನು.  Kannada

[1156 rows x 2 columns]


In [5]:
df = df[~((df["Text"].str.split().str.len() == 1) & (df["Text"].str.len() < 3))]
df.reset_index(drop=True, inplace=True)

In [6]:
ABBREVIATIONS = set([
    # **English**
    "p.m.", "a.m.", "dr.", "mr.", "mrs.", "u.s.a.", "e.g.", "i.e.", "etc.", "vs.", "fig.", "vol.", "no.", "pp.", "gov.", "dept.", "lt.", "gen.", "inc.", "corp.", "est.", "prof.", "ph.d.", "jr.", "sr.", "st.", "mt.", "rev.", "ft.", "sq.", "yr.", "min.", "sec.",

    # **French**
    "m.", "mme.", "mlle.", "dr.", "av.", "boul.", "ch.", "fig.", "etc.", "p.ex.", "cf.", "ibid.", "op.cit.", "c.-à-d.", "n.b.", "p.j.", "t.s.v.p.", "env.", "gov.", "dir.", "adm.", "prof.", "ph.d.",

    # **German**
    "p.m.", "a.m.", "d.h.", "z.B.", "u.a.", "etc.", "vgl.", "usw.", "bzw.", "ff.", "u.E.", "g.U.", "g.g.A.", "Buchst.", "u.s.w.", "sog.", "u.ä.", "Std.", "evtl.", "Zt.", "Chr.", "u.U.", "o.ä.", "Ltd.", "b.A.", "z.Zt.", "spp.", "sen.", "SA", "k.o.", "jun.", "i.H.v.", "dgl.", "dergl.", "Co.", "zzt.", "usf.", "s.p.a.", "Dkr.", "Corp.", "bzgl.", "BSE",

    # **Spanish**
    "p.ej.", "etc.", "s.a.", "sr.", "sra.", "dr.", "prof.", "pág.", "núm.", "gral.", "av.", "c/", "dpto.", "c.c.", "ud.", "u.d.", "u.s.", "u.v.", "a.c.", "d.c.", "admón.", "corp.",

    # **Portuguese**
    "sr.", "sra.", "dr.", "prof.", "av.", "pág.", "etc.", "ex.", "obs.", "exmo.", "adm.", "corp.", "ilmo.", "u.s.", "u.v.", "a.c.", "d.c.", "n.º", "s.l.", "fasc.",

    # **Dutch**
    "blz.", "bijv.", "ca.", "dhr.", "dr.", "e.d.", "e.v.", "enz.", "fig.", "gem.", "i.h.b.", "m.a.w.", "m.n.", "m.v.g.", "n.a.v.", "nr.", "o.a.", "o.i.", "p.m.", "pag.", "t.o.v.", "t.z.t.", "vlg.", "zgn.", "z.i.", "z.s.m.", "z.v.h.",

    # **Italian**
    "sig.", "sig.ra", "sig.na", "ecc.", "dr.", "prof.", "s.p.a.", "s.r.l.", "es.", "avv.", "ing.", "dott.", "p.zza", "v.le", "c.so", "b.s.", "c.m.", "s.n.c.", "n.b.", "c.c.",

    # **Swedish**
    "bl.a.", "d.v.s.", "m.fl.", "m.m.", "nr.", "o.s.v.", "s.a.s.", "t.ex.", "m.a.o.", "jfr.", "ibid.", "c:a", "p.g.a.", "m.h.t.", "d.g.s.", "d.o.",

    # **Danish**
    "bl.a.", "ca.", "dvs.", "m.fl.", "m.m.", "nr.", "osv.", "t.ex.", "m.a.o.", "jfr.", "ibid.", "c:a", "p.g.a.", "f.eks.", "mht.", "a.s.", "cvr.",

    # **Greek**
    "κλπ.", "π.χ.", "δηλ.", "κ.α.", "ο.ε.", "σ.σ.", "βλ.", "περ.", "σελ.", "κα.", "γ.τ.λ.", "γ.τ.κ.",

    # **Russian**
    "и т.д.", "и др.", "и пр.", "г.", "ул.", "д.", "кв.", "км.", "см.", "т.е.", "напр.", "ср.", "с.г.", "п.р.", "ч.п.", "с.г.", "с.р.",

    # **Turkish**
    "sn.", "dr.", "öğr.", "av.", "doç.", "prof.", "vs.", "ör.", "sf.", "ç.", "müh.", "gen.", "alb.", "uzm.", "şb.",

    # **Malayalam**
    "വി.", "മൂ.", "വി.ക.", "ന.ക.", "ഉപ.", "പൂ.ന.", "ചി.", "പി.", "ബി.", "ടി.", "ഡി.", "ഡി.ആർ.", "വി.ഡി.",

    # **Hindi**
    "डॉ.", "श्री.", "संपा.", "सं.", "संपा.", "नि.", "नि.सं.", "वि.", "वि.वि.", "सं.सं.",

    # **Tamil**
    "செ.", "நா.", "தி.", "பி.", "க.", "மு.", "ச.", "ப.", "ஆ.", "பி.எச்.டி.",

    # **Kannada**
    "ಶ್ರೀ.", "ವಿ.", "ಡಾ.", "ಪ್ರೊ.", "ನೋ.", "ಗ.ಶಿ.", "ಚ.ಚಿ.", "ಸಂಪಾ.", "ಸಂ.", "ವಿ.ವಿ.", "ಅ.ಪ್ರ.",

    # **Arabic**
    "د.", "م.", "أ.", "ج.", "س.", "ك.", "ن.", "ب.", "ش.", "ع.", "هـ."
])

In [7]:
def clean_text(text):
  text = text.lower().strip()

  for abbr in ABBREVIATIONS:
      text = text.replace(abbr, abbr.replace('.', ''))

  text = re.sub(r'[^\p{L}\s]', '', text)

  return text.strip()

df['Cleaned_Text'] = df['Text'].apply(clean_text)

In [8]:
def extract_trigrams(text):
  if len(text) < 3:
      return []
  trigrams = [text[i:i+3] for i in range(len(text)-2)]
  return trigrams

In [9]:
start_time = time.time()

df['Trigrams'] = df['Cleaned_Text'].apply(extract_trigrams)

end_time = time.time()
trigram_time = end_time - start_time
print(f"Time for trigram extraction: {round(trigram_time, 4)} seconds")

Time for trigram extraction: 0.6132 seconds


In [10]:
D = 10000
trigram_map = {}

def generate_vector(trigram):
  if trigram not in trigram_map:
      trigram_map[trigram] = np.random.choice([-1, 1], D)
  return trigram_map[trigram]

def calculate_hypervector(trigrams):
  vector = np.sum([generate_vector(t) for t in trigrams], axis=0)
  return vector / np.linalg.norm(vector)

df_filtered = df[df['Trigrams'].apply(lambda x: len(x) > 0)].copy()

In [11]:
start_time = time.time()

df_filtered.loc[:, 'Hypervector'] = df_filtered['Trigrams'].apply(calculate_hypervector)

end_time = time.time()
trigram_time = end_time - start_time
print(f"Time for training phase: {round(trigram_time, 4)} seconds")

Time for training phase: 45.8002 seconds


In [44]:
def prepare_train_test_split(df, test_size=0.2, random_state=42):
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state, stratify=df['Language'])

    test_invalid_rows = test_df[test_df['Cleaned_Text'].str.split().str.len() < 4]

    train_df = pd.concat([train_df, test_invalid_rows])

    test_df = test_df[test_df['Cleaned_Text'].str.split().str.len() >= 4]

    rows_to_move = len(test_invalid_rows)

    train_valid_rows = train_df[train_df['Cleaned_Text'].str.split().str.len() > 4]
    additional_rows = train_valid_rows.sample(n=rows_to_move, random_state=random_state)

    test_df = pd.concat([test_df, additional_rows])

    train_df = train_df.drop(additional_rows.index)

    test_df = test_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    train_df = train_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return train_df, test_df

In [45]:
language_vectors = {}

train_df, test_df = prepare_train_test_split(df_filtered)

for lang in train_df['Language'].unique():
    vectors = np.array(train_df[train_df['Language'] == lang]['Hypervector'].tolist())
    language_vectors[lang] = np.mean(vectors, axis=0)

In [46]:
def predict_language(hypervector, language_vectors):
    similarities = {lang: cosine_similarity([hypervector], [vec])[0][0] for lang, vec in language_vectors.items()}
    return max(similarities, key=similarities.get)

In [47]:
start_time = time.time()

test_df['Predicted_Language'] = test_df['Hypervector'].apply(lambda x: predict_language(x, language_vectors))

end_time = time.time()
trigram_time = end_time - start_time
print(f"Time for prediction phase: {round(trigram_time, 4)} seconds")

Time for prediction phase: 22.1326 seconds


In [48]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(test_df['Language'], test_df['Predicted_Language'])
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 98.06%


In [50]:
mispredictions = test_df[test_df['Language'] != test_df['Predicted_Language']]

print(f"Wrong predictions \n")
print(mispredictions[['Text', 'Language', 'Predicted_Language']].head())

Wrong predictions 

                                                  Text  Language  \
9                   oye cálmate juntos otra frase que.   Spanish   
45             dina vänner jag känner mig mycket trög.  Sweedish   
57   Des associations de ce type sont présentes en ...    French   
321  [58] Three broad categories of anomaly detecti...   English   
378                           Jag håller 100% med dig.  Sweedish   

    Predicted_Language  
9           Portugeese  
45              Danish  
57               Dutch  
321             French  
378             Danish  


In [37]:
new_text = "Это простой русский текст"
cleaned_text = clean_text(new_text)
trigrams = extract_trigrams(cleaned_text)
hypervector = calculate_hypervector(trigrams)
predicted_language = predict_language(hypervector, language_vectors)

print(f"Predicted Language: {predicted_language}")

Predicted Language: Russian
