# South African Language Identification Hack 2023

© Explore Data Science Academy

## Honour Code

I, **Tercius** **Mapholo**, confirm - by submitting this document - that the solutions in this notebook are a result of my own work and that I abide by the EDSA honour code (https://drive.google.com/file/d/1QDCjGZJ8-FmJE3bZdIQNwnJyQKPhHZBn/view?usp=sharing).

Non-compliance with the honour code constitutes a material breach of contract.

In [21]:
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [15]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Read the three competition CSVs from the working directory, in order:
# the labelled training rows, the test rows, and the sample submission.
df_train, df_test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train_set.csv', 'test_set.csv', 'sample_submission.csv')
)

In [3]:
# Peek at the first five training rows to check the loaded columns.
df_train.head(n=5)

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [4]:
# Peek at the first five test rows to check the loaded columns.
df_test.head(n=5)

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


In [5]:
# Peek at the sample submission to see the expected output columns.
sample_submission.head(n=5)

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl


In [11]:
def _tokenize_and_rejoin(text):
    """Split ``text`` with NLTK's ``word_tokenize`` and join the tokens back with single spaces."""
    return ' '.join(word_tokenize(text))


# Tokenize and immediately re-join each document in both splits; the `text`
# column remains a string column and is overwritten in place.
df_train['text'] = df_train['text'].apply(_tokenize_and_rejoin)
df_test['text'] = df_test['text'].apply(_tokenize_and_rejoin)

In [8]:
# Strip every character that is neither a word character nor whitespace, then
# lowercase. `regex=True` is required: without it pandas >= 2.0 treats the
# pattern as a literal string and removes nothing (older versions emit a
# FutureWarning about this default). The raw string avoids invalid-escape
# warnings for `\w` and `\s`.
PUNCTUATION_PATTERN = r'[^\w\s]'
df_train['text'] = df_train['text'].str.replace(PUNCTUATION_PATTERN, '', regex=True).str.lower()
df_test['text'] = df_test['text'].str.replace(PUNCTUATION_PATTERN, '', regex=True).str.lower()

  df_train['text'] = df_train['text'].str.replace('[^\w\s]', '').str.lower()
  df_test['text'] = df_test['text'].str.replace('[^\w\s]', '').str.lower()


In [12]:
# Re-inspect the first five test rows after tokenizing and cleaning.
df_test.head(n=5)

Unnamed: 0,index,text
0,1,mmasepala fa maemo a a kgethegileng a letlelel...
1,2,uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,tshivhumbeo tshi fana na ngano dza vhathu
3,4,kube inja nelikati betingevakala kutsi titsini...
4,5,winste op buitelandse valuta


In [13]:
# Feature Engineering
# Labels come straight from the `lang_id` column; features are TF-IDF weights
# computed from the `text` column with `max_features=5000`.
# NOTE(review): the vectorizer is fitted on all training rows before the
# train/validation split, so validation text influences the vocabulary and IDF
# weights - confirm this is acceptable for the reported validation scores.
y = df_train['lang_id']
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(raw_documents=df_train['text'])

In [14]:
# Train a Language Identification Model
# Hold out 20% of the vectorized rows for validation; the fixed seed keeps the
# split the same between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [27]:
# Fit a Multinomial Naive Bayes classifier, with its default settings, on the
# training portion of the TF-IDF matrix.
nb_classifier = MultinomialNB()
nb_classifier.fit(X=X_train, y=y_train)

MultinomialNB()

In [29]:
# Model Evaluation (Naive Bayes)
# Predict the held-out validation rows and report accuracy to four decimals.
y_pred_nb = nb_classifier.predict(X=X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred_nb)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.9930


In [30]:
# Per-language precision / recall / F1 for the Naive Bayes validation predictions.
# A single newline-separated print emits the heading followed by the report.
print(
    "Classification Report:",
    classification_report(y_true=y_val, y_pred=y_pred_nb),
    sep="\n",
)

Classification Report:
              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       583
         eng       1.00      1.00      1.00       615
         nbl       0.98      0.98      0.98       583
         nso       1.00      1.00      1.00       625
         sot       1.00      1.00      1.00       618
         ssw       1.00      0.99      1.00       584
         tsn       1.00      1.00      1.00       598
         tso       1.00      1.00      1.00       561
         ven       1.00      1.00      1.00       634
         xho       0.99      0.98      0.99       609
         zul       0.97      0.97      0.97       590

    accuracy                           0.99      6600
   macro avg       0.99      0.99      0.99      6600
weighted avg       0.99      0.99      0.99      6600



In [22]:
# Train a Language Identification Model (SVM)
# Linear-kernel support vector classifier fitted on the training split.
# NOTE(review): scikit-learn documents `random_state` as ignored unless
# `probability=True` - confirm whether the seed is needed here.
svm_classifier = SVC(C=1.0, kernel='linear', random_state=42)
svm_classifier.fit(X=X_train, y=y_train)

SVC(kernel='linear', random_state=42)

In [23]:
# Model Evaluation (SVM)
# Predict the validation rows, then print the heading, accuracy and
# classification report with one newline-separated print call.
y_pred_svm = svm_classifier.predict(X=X_val)
accuracy_svm = accuracy_score(y_true=y_val, y_pred=y_pred_svm)
print(
    "\nSupport Vector Machine (SVM):",
    f"Validation Accuracy: {accuracy_svm:.4f}",
    "Classification Report:",
    classification_report(y_true=y_val, y_pred=y_pred_svm),
    sep="\n",
)


Support Vector Machine (SVM):
Validation Accuracy: 0.9876
Classification Report:
              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       583
         eng       1.00      1.00      1.00       615
         nbl       0.96      0.97      0.96       583
         nso       1.00      1.00      1.00       625
         sot       1.00      1.00      1.00       618
         ssw       0.99      0.99      0.99       584
         tsn       1.00      0.99      1.00       598
         tso       1.00      1.00      1.00       561
         ven       1.00      1.00      1.00       634
         xho       0.97      0.96      0.97       609
         zul       0.95      0.96      0.95       590

    accuracy                           0.99      6600
   macro avg       0.99      0.99      0.99      6600
weighted avg       0.99      0.99      0.99      6600



In [24]:
# Train a Language Identification Model (Logistic Regression)
# `max_iter` is raised to 1000 and the seed is fixed at 42.
logreg_classifier = LogisticRegression(random_state=42, max_iter=1000)
logreg_classifier.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=1000, random_state=42)

In [25]:
# Model Evaluation (Logistic Regression)
# Predict the validation rows, then print the heading, accuracy and
# classification report with one newline-separated print call.
y_pred_logreg = logreg_classifier.predict(X=X_val)
accuracy_logreg = accuracy_score(y_true=y_val, y_pred=y_pred_logreg)
print(
    "\nLogistic Regression:",
    f"Validation Accuracy: {accuracy_logreg:.4f}",
    "Classification Report:",
    classification_report(y_true=y_val, y_pred=y_pred_logreg),
    sep="\n",
)



Logistic Regression:
Validation Accuracy: 0.9895
Classification Report:
              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       583
         eng       1.00      1.00      1.00       615
         nbl       0.97      0.98      0.97       583
         nso       1.00      1.00      1.00       625
         sot       1.00      1.00      1.00       618
         ssw       0.99      0.98      0.99       584
         tsn       1.00      1.00      1.00       598
         tso       1.00      1.00      1.00       561
         ven       1.00      1.00      1.00       634
         xho       0.98      0.97      0.98       609
         zul       0.96      0.96      0.96       590

    accuracy                           0.99      6600
   macro avg       0.99      0.99      0.99      6600
weighted avg       0.99      0.99      0.99      6600



In [31]:
# Make Predictions on the Test Set (Using the best performing model - Naive Bayes in this case)
# Only `transform` is called on the test text: the vectorizer fitted on the
# training text is reused rather than refitted.
X_test = tfidf_vectorizer.transform(raw_documents=df_test['text'])
test_predictions = nb_classifier.predict(X=X_test)

In [32]:
# Prepare Submission File
# Pair each test row's `index` value with its predicted `lang_id`.
submission_df = pd.DataFrame(
    data={
        'index': df_test['index'],
        'lang_id': test_predictions,
    }
)

In [33]:
# Check the first five rows of the submission frame before writing it out.
submission_df.head(n=5)

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,afr


In [34]:
# Save the Submission File
# `index=False` keeps the DataFrame's own row labels out of the CSV, so the
# file contains only the `index` and `lang_id` columns.
submission_df.to_csv(
    path_or_buf='language_identification_submission.csv',
    index=False,
)