In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder


In [2]:
# Read the labelled training corpus and the test corpus from the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ("train_set.csv", "test_set.csv")
)

# Print the leading rows of the training frame as a quick sanity check.
print(train_data.head())


  lang_id                                               text
0     xho  umgaqo-siseko wenza amalungiselelo kumaziko ax...
1     xho  i-dha iya kuba nobulumko bokubeka umsebenzi na...
2     eng  the province of kwazulu-natal department of tr...
3     nso  o netefatša gore o ba file dilo ka moka tše le...
4     ven  khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [3]:

# Data cleaning and preprocessing
def clean_text(text):
    """Normalise one raw text sample before it is vectorised.

    The steps run in this order:
      1. strip URLs (anything starting with ``http``, ``https`` or ``www``),
      2. drop every character that is neither a word character nor whitespace,
      3. drop digit runs,
      4. insert a space at each lowercase-to-uppercase boundary,
      5. lowercase the result.

    Args:
        text: The raw string to clean. ``None`` and float NaN (pandas' marker
            for a missing value) are treated as missing text.

    Returns:
        The cleaned, lowercased string; ``""`` when ``text`` is missing.
        Whitespace is not collapsed, so removed tokens may leave double or
        trailing spaces behind.
    """
    # Missing values would make re.sub raise TypeError; map them to empty text.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)

    # Remove special characters and numbers
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\d+", "", text)

    # Handle mixed languages by separating them with spaces.
    # This has to happen BEFORE lowercasing: once the text is lowercased there
    # are no uppercase letters left and the pattern can never match.
    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)

    # Convert to lowercase
    return text.lower()

In [4]:
# Add a 'cleaned_text' column to the training frame first, then to the test
# frame, by running clean_text over every entry of each frame's 'text' column.
for frame in (train_data, test_data):
    frame['cleaned_text'] = frame['text'].apply(clean_text)

In [5]:
# Show the leading rows of the training frame, including the cleaned_text column.
train_data.head(n=5)

Unnamed: 0,lang_id,text,cleaned_text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqosiseko wenza amalungiselelo kumaziko axh...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,idha iya kuba nobulumko bokubeka umsebenzi nap...
2,eng,the province of kwazulu-natal department of tr...,the province of kwazulunatal department of tra...
3,nso,o netefatša gore o ba file dilo ka moka tše le...,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [8]:
# Fit the encoder on the language codes, then store the integer class ids in a
# new column. The fitted encoder stays in scope so predictions can be decoded
# back to language codes later on.
label_encoder = LabelEncoder().fit(train_data['lang_id'])
train_data['LanguageIdEncoded'] = label_encoder.transform(train_data['lang_id'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
X_train, X_val, y_train_encoded, y_val_encoded = train_test_split(
    train_data['cleaned_text'],
    train_data['LanguageIdEncoded'],
    test_size=0.2,
    random_state=42,
)

In [17]:
# Define a function to train and evaluate a model
def train_and_evaluate_model(model, X_train, y_train_encoded, X_val, y_val_encoded):
    """Fit ``model`` inside a text pipeline and report validation metrics.

    The classifier is wrapped in ``CountVectorizer`` -> ``TfidfTransformer`` ->
    ``model``, fitted on the training split, and scored on the validation
    split. Encoded labels are decoded with the module-level ``label_encoder``
    so the printed report shows language codes rather than integers.

    Args:
        model: An unfitted scikit-learn compatible classifier. It is fitted in
            place as the final pipeline step.
        X_train: Training texts.
        y_train_encoded: Integer-encoded training labels.
        X_val: Validation texts.
        y_val_encoded: Integer-encoded validation labels.

    Returns:
        A ``(pipeline, f1)`` tuple: the fitted pipeline and its weighted F1
        score on the validation split. The original discarded both, so the
        evaluated models could not be reused or compared in code.
    """
    # Create a pipeline with a text feature extraction and a classifier
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', model)
    ])

    # Train the model
    text_clf.fit(X_train, y_train_encoded)

    # Predictions
    y_pred_encoded = text_clf.predict(X_val)

    # Decode predictions for better interpretation
    y_pred = label_encoder.inverse_transform(y_pred_encoded)
    y_val = label_encoder.inverse_transform(y_val_encoded)

    # Evaluate the model
    f1 = f1_score(y_val, y_pred, average='weighted')
    print("F1 Score:", f1)
    print("\nClassification Report:\n", classification_report(y_val, y_pred))

    # Hand back the fitted pipeline and its score so callers can keep the best one.
    return text_clf, f1

# Train and evaluate different models.
# The randomised learners are seeded with the same value as the train/validation
# split so the reported scores are reproducible across runs.
models = [
    MultinomialNB(),
    SVC(),
    RandomForestClassifier(random_state=42),
    XGBClassifier(random_state=42)
]

for model in models:
    print(f"\nTraining and evaluating {model.__class__.__name__}...")
    train_and_evaluate_model(model, X_train, y_train_encoded, X_val, y_val_encoded)


Training and evaluating MultinomialNB...
F1 Score: 0.9984848537376715

Classification Report:
               precision    recall  f1-score   support

         afr       1.00      1.00      1.00       583
         eng       1.00      1.00      1.00       615
         nbl       0.99      1.00      1.00       583
         nso       1.00      1.00      1.00       625
         sot       1.00      1.00      1.00       618
         ssw       1.00      1.00      1.00       584
         tsn       1.00      1.00      1.00       598
         tso       1.00      1.00      1.00       561
         ven       1.00      1.00      1.00       634
         xho       1.00      1.00      1.00       609
         zul       1.00      0.99      0.99       590

    accuracy                           1.00      6600
   macro avg       1.00      1.00      1.00      6600
weighted avg       1.00      1.00      1.00      6600


Training and evaluating SVC...
F1 Score: 0.995461159619058

Classification Report:
       

In [None]:
# Choose the best model for submission.
# The classifier has to sit behind the same vectorising steps that were used
# during evaluation: a bare MultinomialNB cannot be fitted on raw strings.
best_model = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),  # Replace with the best-performing model
])

# Train the best model on the entire dataset
best_model.fit(train_data['cleaned_text'], train_data['LanguageIdEncoded'])

# Load the test set
test_data = pd.read_csv("test_set.csv")

# Apply the same cleaning steps to the test set
test_data['cleaned_text'] = test_data['text'].apply(clean_text)

# Predictions on the test set
test_predictions_encoded = best_model.predict(test_data['cleaned_text'])

# Decode the predictions for submission
test_predictions_labels = label_encoder.inverse_transform(test_predictions_encoded)



In [None]:

# Pair the test frame's 'index' column with the decoded language predictions.
submission_columns = {
    'index': test_data['index'],
    'lang_id': test_predictions_labels,
}
submission_df = pd.DataFrame(submission_columns)

# Write the submission to disk, leaving out the DataFrame's own row index.
submission_df.to_csv("submission.csv", index=False)