In [20]:
import pandas as pd # for data analysis and associated manipulation of tabular data in DataFrames.
import numpy as np #for performing mathematical operations on arrays.
import statistics # for analysation and visualisation of data to find unseen patterns.
from numpy import set_printoptions
import datetime#for extracting new features that can be added to the other features of the dataset.
from dateutil.parser import parse
from datetime import datetime
import matplotlib.pyplot as plt # creating static and interactive visualisations.
import seaborn as sns#for making statistical graphics.
from matplotlib import rc
import statsmodels.api as sm
%config InlineBackend.figure_format = 'retina' # for plotting figures and avoiding blurry images
sns.set_context('notebook') # Larger scale for plots in notebooks
## from skimpy import skim

# Plotting Pretty figures and avoiding blurry images
%config InlineBackend.figure_format = 'retina'

# Larger scale for plots in notebooks
sns.set_context('notebook')

# Libraries for data preparation and model building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import mean_squared_error as MSE
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
import math
from statsmodels.graphics.correlation import plot_corr
import statsmodels.formula.api as sm
from statsmodels.formula.api import ols
from scipy.stats import pearsonr
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn import metrics

# Setting global constants to ensure notebook results are reproducible
#PARAMETER_CONSTANT = ###

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')
from wordcloud import WordCloud

# 2. Loading the Data

In [21]:
# train data
df_train = pd.read_csv('https://raw.githubusercontent.com/TebogoMngoma/South-African-Language-Identification-Hack-2022/main/train_set.csv')
#test data
df_test = pd.read_csv('https://raw.githubusercontent.com/TebogoMngoma/South-African-Language-Identification-Hack-2022/main/test_set.csv')

# 3. Exploratory Data Analysis (EDA)


In [22]:
#train dataset
df_train

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...
...,...,...
32995,tsn,popo ya dipolateforomo tse ke go tlisa boetele...
32996,sot,modise mosadi na o ntse o sa utlwe hore thaban...
32997,eng,closing date for the submission of completed t...
32998,xho,nawuphina umntu ofunyenwe enetyala phantsi kwa...


In [23]:
#test dataset
df_test

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.
...,...,...
5677,5678,You mark your ballot in private.
5678,5679,Ge o ka kgetha ka bowena go se šomiše Mofani k...
5679,5680,"E Ka kopo etsa kgetho ya hao ka hloko, hobane ..."
5680,5681,"TB ke bokudi ba PMB, mme Morero o tla lefella ..."


In [24]:
#concatenate datasets
df_train = df_train.join(df_test['index'])

In [25]:
df_train

Unnamed: 0,lang_id,text,index
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,1.0
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,2.0
2,eng,the province of kwazulu-natal department of tr...,3.0
3,nso,o netefatša gore o ba file dilo ka moka tše le...,4.0
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,5.0
...,...,...,...
32995,tsn,popo ya dipolateforomo tse ke go tlisa boetele...,
32996,sot,modise mosadi na o ntse o sa utlwe hore thaban...,
32997,eng,closing date for the submission of completed t...,
32998,xho,nawuphina umntu ofunyenwe enetyala phantsi kwa...,


In [26]:
train_df = df_train.dropna()

In [27]:
train_df

Unnamed: 0,lang_id,text,index
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,1.0
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,2.0
2,eng,the province of kwazulu-natal department of tr...,3.0
3,nso,o netefatša gore o ba file dilo ka moka tše le...,4.0
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,5.0
...,...,...,...
5677,nbl,umthethosisekelo ugunyaza itjhuguluko ebujamen...,5678.0
5678,sot,dikhampani kapa kgwebo tsa motho ya mong di lo...,5679.0
5679,sot,dingaka di dumela hore motho ya tsubang dithet...,5680.0
5680,nso,tumelo yeo e fiwago ka ntle ga afrika borwa e ...,5681.0


### Feature Engineering

In [28]:
#split the dataset 
x = np.array(train_df["text"]) #X is our feature
y = np.array(train_df["lang_id"]) #y is our label

In [29]:
#split and fit x into countvectorizer to make values numerical
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(x)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.20, 
                                                    random_state=42)

In [30]:
#import Logistic Regression model
from sklearn.linear_model import LogisticRegression

In [31]:
lr = LogisticRegression(multi_class='ovr')

In [32]:
lr.fit(X_train,y_train)

LogisticRegression(multi_class='ovr')

In [33]:
y_pred_test = lr.predict(X_test)

In [34]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_test))

import warnings
warnings.filterwarnings('ignore')

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00        97
         eng       1.00      1.00      1.00        96
         nbl       0.93      0.97      0.95       108
         nso       1.00      1.00      1.00        84
         sot       1.00      1.00      1.00       110
         ssw       0.99      0.97      0.98       107
         tsn       1.00      1.00      1.00       102
         tso       1.00      1.00      1.00       116
         ven       1.00      0.99      1.00       104
         xho       0.96      0.99      0.97        97
         zul       0.95      0.91      0.93       116

    accuracy                           0.98      1137
   macro avg       0.98      0.99      0.99      1137
weighted avg       0.98      0.98      0.98      1137



In [35]:
# convert index to int values
train_df["index"] = train_df["index"].astype(int)

In [39]:
#kaggle submission
submission_lrm = pd.DataFrame()
submission_lrm['index'] = train_df['index']
submission_lrm['lang_id'] = train_df['lang_id']
submission_lrm.to_csv('lrm_submit.csv', index=False)


In [42]:
with open('lrm_submit.csv') as f:
    for _ in range(10): # first 10 lines
        print(f.readline())

index,lang_id

1,xho

2,xho

3,eng

4,nso

5,ven

6,nso

7,tsn

8,ven

9,nso

