# Language Identification in South African Text: Kaggle Competition

This notebook presents my approach to tackle the Language Identification Challenge on Kaggle. The challenge focuses on classifying text written in South Africa's 11 Official languages. The notebook covers data exploration, preprocessing, feature extraction, model training, evaluation, and submission generation. By leveraging machine learning techniques, I aim to develop a classification model that accurately predicts the language of a given text.

## Importing necessary libraries

In [60]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB


## Loading the data

In [61]:
import pandas as pd
train_df = pd.read_csv('train_set.csv')
test_df = pd.read_csv('test_set.csv')

In [62]:
import pandas as pd

# Specify the full path or adjust the path accordingly
train_file_path = 'train_set.csv'
test_file_path = 'test_set.csv'

try:
    train_df = pd.read_csv(train_file_path)
    test_df = pd.read_csv(test_file_path)
    print("Data loaded successfully.")
except FileNotFoundError:
    print(f"File not found. Please check the file paths: {train_file_path} and {test_file_path}")
except Exception as e:
    print(f"An error occurred: {e}")


Data loaded successfully.


## Exploratory Data Analysis (EDA)

In [63]:
print("Train Dataset:")
print(train_df.head())

print("\nTest Dataset:")
print(test_df.head())


Train Dataset:
  lang_id                                               text
0     xho  umgaqo-siseko wenza amalungiselelo kumaziko ax...
1     xho  i-dha iya kuba nobulumko bokubeka umsebenzi na...
2     eng  the province of kwazulu-natal department of tr...
3     nso  o netefatša gore o ba file dilo ka moka tše le...
4     ven  khomishini ya ndinganyiso ya mbeu yo ewa maana...

Test Dataset:
   index                                               text
0      1  Mmasepala, fa maemo a a kgethegileng a letlele...
1      2  Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2      3         Tshivhumbeo tshi fana na ngano dza vhathu.
3      4  Kube inja nelikati betingevakala kutsi titsini...
4      5                      Winste op buitelandse valuta.


## Data Preprocessing

In [64]:
def preprocess_data(train_df, test_df):
    # Initializing the TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Fitting the vectorizer on the training data
    vectorizer.fit(train_df['text'])

    # Transforming the training and test data using the fitted vectorizer
    train_features = vectorizer.transform(train_df['text'])
    test_features = vectorizer.transform(test_df['text'])

    return train_features, test_features, vectorizer



## Preprocessing the data

In [81]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

def preprocess_data(train_df, test_df):
    # Assuming your DataFrame has a 'text' column containing the text data
    train_text = train_df['text'].astype(str)
    test_text = test_df['text'].astype(str)

    # Use CountVectorizer for text representation
    vectorizer = CountVectorizer()
    train_features = vectorizer.fit_transform(train_text)
    test_features = vectorizer.transform(test_text)

    return train_features, test_features, vectorizer

# Example usage:
# Replace 'train_set.csv' and 'test_set.csv' with your actual file names or paths
train_df = pd.read_csv('train_set.csv')
test_df = pd.read_csv('test_set.csv')

train_features, test_features, vectorizer = preprocess_data(train_df, test_df)


### Logistic Regression

In [82]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_features, train_df['lang_id'], test_size=0.2, random_state=42)

# Initialize and train a logistic regression model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Make predictions on the validation set
lr_preds = lr_model.predict(X_val)

# Calculate the F1 score
lr_f1 = f1_score(y_val, lr_preds, average='weighted')

print("Logistic Regression F1 Score:", lr_f1)



Logistic Regression F1 Score: 0.9939385509753286


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### K Nearest Neighbors (KNN)

In [89]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_features, train_df['lang_id'], test_size=0.2, random_state=42)

# Initialize and train a k-Nearest Neighbors model
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)

# Make predictions on the validation set
knn_preds = knn_model.predict(X_val)

# Calculate the F1 score
knn_f1 = f1_score(y_val, knn_preds, average='weighted')

print("KNN F1 Score:", knn_f1)



KNN F1 Score: 0.9175947192684192


### Support Vector Machine

In [90]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import f1_score

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_features, train_df['lang_id'], test_size=0.2, random_state=42)

# Initialize and train a Support Vector Machine (SVM) model
svm_model = SVC()
svm_model.fit(X_train, y_train)

# Make predictions on the validation set
svm_predictions = svm_model.predict(X_val)

# Calculate the F1 score
svm_f1 = f1_score(y_val, svm_predictions, average='weighted')

print("SVM F1 Score:", svm_f1)


SVM F1 Score: 0.9915204291200247


### Naive Bayes

In [95]:
nb = MultinomialNB()
nb.fit(X_train, y_train)
nb_predictions = nb.predict(X_val)
nb_f1 = f1_score(y_val, nb_predictions, average='weighted')
print("Naive Bayes F1 Score:", nb_f1)


Naive Bayes F1 Score: 0.9989392771541917


In [96]:

# Splitting the data for model training
X_train, X_val, y_train, y_val = train_test_split(train_features, train_df['lang_id'], test_size=0.2, random_state=42)

# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Generating predictions on the validation set
nb_predictions_val = nb_model.predict(X_val)
nb_f1_val = f1_score(y_val, nb_predictions_val, average='weighted')
print("Naive Bayes F1 Score on Validation Set:", nb_f1_val)


Naive Bayes F1 Score on Validation Set: 0.9989392771541917


## Generate predictions on the test set

In [97]:
# Generating predictions on the test set using the Naive Bayes model
X_test = vectorizer.transform(test_df['text'])
test_predictions = nb_model.predict(X_test)

# Creating a submission dataframe with 'index' and 'lang_id' columns
submission_df = pd.DataFrame({'index': test_df['index'], 'lang_id': test_predictions})

## Creating a csv for submission

In [94]:
# Save the submission CSV
submission_df.to_csv('FinalSub1.csv', index=False)