## Noluthando Mtshali - Hackathon

In [1]:
#import the required packages
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle  # Import pickle library

In [2]:
import pandas as pd

In [3]:
# Read the labelled training set from the working directory.
train = pd.read_csv("train_set.csv")

In [4]:
# Display the loaded training frame to inspect its columns and a sample of rows.
train

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...
...,...,...
32995,tsn,popo ya dipolateforomo tse ke go tlisa boetele...
32996,sot,modise mosadi na o ntse o sa utlwe hore thaban...
32997,eng,closing date for the submission of completed t...
32998,xho,nawuphina umntu ofunyenwe enetyala phantsi kwa...


In [5]:
# Raw sentences (model inputs) as a plain Python list.
text_data = train["text"].to_list()

In [6]:
# Language codes (model targets), in the same row order as text_data.
languages = train["lang_id"].to_list()

In [7]:

# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the split (and every accuracy computed from it)
# is identical on each Restart & Run All instead of changing from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    text_data, languages, test_size=0.2, random_state=42
)


In [8]:
# TF-IDF feature extraction.
# The vocabulary and IDF weights are learned from the training texts only;
# the held-out texts are merely transformed so no test information leaks into the fit.
vectorizer = TfidfVectorizer()
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

In [10]:
# Candidate classifiers to compare, as (display name, unfitted estimator) pairs.
# The display name is also used as the stem of the pickle and CSV file names
# written by the evaluation and prediction cells.
models = [
    ("Logistic Regression", LogisticRegression()),
    ("K-Nearest Neighbors", KNeighborsClassifier(n_neighbors=3)),
    # Seeded: the tree shuffles candidate features at each split, so without a
    # fixed random_state the fitted tree (and its accuracy) can differ between runs.
    ("Decision Tree Classifier", DecisionTreeClassifier(random_state=42)),
]

In [11]:
# Train every candidate on the TF-IDF training features, report its accuracy on
# the held-out split, and pickle the fitted estimator as "<name>.pkl".
for name, model in models:
    model.fit(X_train_features, y_train)

    y_pred = model.predict(X_test_features)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")

    # Persist the fitted model under its display name.
    pickle_path = f"{name}.pkl"
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(model, pickle_file)

Logistic Regression Accuracy: 0.9947
K-Nearest Neighbors Accuracy: 0.9553
Decision Tree Classifier Accuracy: 0.9465


In [12]:
# Read the unlabelled test set that predictions will be submitted for.
test_data = pd.read_csv("test_set.csv")

In [13]:
# Display the loaded test frame to inspect its columns and a sample of rows.
test_data

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.
...,...,...
5677,5678,You mark your ballot in private.
5678,5679,Ge o ka kgetha ka bowena go se šomiše Mofani k...
5679,5680,"E Ka kopo etsa kgetho ya hao ka hloko, hobane ..."
5680,5681,"TB ke bokudi ba PMB, mme Morero o tla lefella ..."


In [14]:
# Test-set sentences as a plain Python list, ready for the vectorizer.
new_data = test_data["text"].to_list()

In [15]:
# Reuse the TF-IDF vectorizer that was fitted on the training texts in an earlier cell.
# Only transform() is called, so the test-set features use the training vocabulary.
# Do not re-instantiate TfidfVectorizer here: a fresh instance would not carry that fitted vocabulary.
new_data_features = vectorizer.transform(new_data)

In [16]:
# Write one submission CSV per model: reload each pickled estimator, predict the
# language of every test sentence, and save the result as "<name>_predictions.csv"
# with the columns "index" and "lang_id".
for name, model in models:
    # Round-trip through the pickle written by the evaluation cell so the saved
    # artefact itself is what produces the submission (the in-memory `model` is unused).
    # NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
    with open(f"{name}.pkl", "rb") as f:
        loaded_model = pickle.load(f)

    # Predict labels for the test-set features
    new_data_pred = loaded_model.predict(new_data_features)

    # Pair each prediction with the row id shipped in the test set instead of a
    # synthetic 1..n range, so the ids stay correct even if the file is not
    # numbered contiguously. Both sequences are in test_data row order.
    data_frame = pd.DataFrame(
        {"index": test_data["index"].to_list(), "lang_id": new_data_pred}
    )

    # The id is now a real, named column, so the DataFrame's own positional index
    # is not written. Previously the id was the unnamed index, giving a CSV whose
    # header was ",lang_id" (the rename of a non-existent "index" column was a no-op).
    data_frame.to_csv(f"{name}_predictions.csv", index=False)

    print(f"{name} Predictions saved to {name}_predictions.csv")

Logistic Regression Predictions saved to Logistic Regression_predictions.csv
K-Nearest Neighbors Predictions saved to K-Nearest Neighbors_predictions.csv
Decision Tree Classifier Predictions saved to Decision Tree Classifier_predictions.csv
