In [32]:
# utilities
# Importing the Libraries 
import re  # for pattern matching and manipulation of strings.
import numpy as np  # used for numerical computations and data manipulation.
import pandas as pd  # for data manipulation and analysis.
import string  # for string manipulation tasks.

# Remove duplicates but keep the first occurence of the text
# For plotting
import seaborn as sns  # for data visualization.
from wordcloud import WordCloud  # used to generate word clouds.
import matplotlib.pyplot as plt  # used for plotting data.

# nltk
from nltk.stem import WordNetLemmatizer  # used for lemmatizing words
from nltk.tokenize import TreebankWordTokenizer  # used for tokenizing sentences into words.
from nltk import SnowballStemmer  # used for stemming words.

# sklearn
from sklearn.svm import LinearSVC  # used for solving linear classification problems. 
from sklearn.naive_bayes import BernoulliNB  # implementation of the Naive Bayes algorithm.
from sklearn.linear_model import LogisticRegression  # implementation of logistic regression.  
from sklearn.model_selection import train_test_split  # for splitting a dataset into training and testing subsets.
from sklearn.feature_extraction.text import CountVectorizer  # for converting text documents into a numerical representation.
from sklearn.metrics import confusion_matrix, classification_report  # for evaluating the performance of a classification model
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn import preprocessing  # for data preprocessing
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
import timeit
from sklearn.metrics import accuracy_score
# suppress cell warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the two CSV inputs (paths are relative to the working directory).
df = pd.read_csv("train_set.csv")  # reads "train_set.csv" into the DataFrame df
df_test = pd.read_csv("test_set.csv")  # reads "test_set.csv" into the DataFrame df_test

In [4]:
df.head()  # preview the first five rows of df

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [5]:
df_test.head()  # preview the first five rows of df_test

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


In [6]:
# Summarise df: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33000 entries, 0 to 32999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   lang_id  33000 non-null  object
 1   text     33000 non-null  object
dtypes: object(2)
memory usage: 515.8+ KB


In [7]:
# Summarise df_test: row count, column names, non-null counts and dtypes.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5682 entries, 0 to 5681
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   index   5682 non-null   int64 
 1   text    5682 non-null   object
dtypes: int64(1), object(1)
memory usage: 88.9+ KB


In [8]:
# Count the rows of df that contain at least one missing value.
np.sum(df.isnull().any(axis=1))

0

In [10]:
# Count the rows of df that exactly repeat an earlier row.
# This cell only reports the count; nothing is removed from df here.
df.duplicated().sum()

3052

In [11]:
# Build the de-duplicated working frame, keeping the first occurrence of each
# repeated row. df itself is left untouched.
# The trailing .copy() makes dft an independent DataFrame, so columns can be
# added to it later without pandas treating the assignment as a write into a
# derived slice of df. The original row labels are kept (no index reset).
dft = df.drop_duplicates(keep='first').copy()

# Sanity check: no duplicated rows should remain.
dft.duplicated().sum()

0

In [28]:
# Count the rows of df_test that contain at least one missing value.
np.sum(df_test.isnull().any(axis=1))

0

In [29]:
# Count the rows of df_test that exactly repeat an earlier row.
df_test.duplicated().sum()

0

In [30]:
def clean_text(text):
    """Strip punctuation and underscores from ``text`` and normalise whitespace.

    Every character that is neither a word character (letter, digit or
    underscore, Unicode-aware) nor whitespace is deleted, and underscores are
    deleted as well. Each run of whitespace is then collapsed to a single
    space, with no leading or trailing space. Characters are deleted rather
    than replaced by a space, so "kwazulu-natal" becomes "kwazulunatal".
    Letter case is left unchanged.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Underscore counts as a word character, so it needs its own alternative
    # next to the "not word, not whitespace" class.
    text = re.sub(r'[^\w\s]|_', '', text)
    # split() with no argument discards leading/trailing whitespace and splits
    # on whitespace runs, so the joined result needs no further strip().
    return ' '.join(text.split())

# Store the cleaned version of each test text in a new 'clean_text' column,
# then preview the result.
df_test['clean_text'] = df_test['text'].apply(clean_text)
df_test.head()

Unnamed: 0,index,text,clean_text
0,1,"mmasepala, fa maemo a a kgethegileng a letlele...",mmasepala fa maemo a a kgethegileng a letlelel...
1,2,uzakwaziswa ngokufaneleko nakungafuneka eminye...,uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,tshivhumbeo tshi fana na ngano dza vhathu.,tshivhumbeo tshi fana na ngano dza vhathu
3,4,kube inja nelikati betingevakala kutsi titsini...,kube inja nelikati betingevakala kutsi titsini...
4,5,winste op buitelandse valuta.,winste op buitelandse valuta


In [12]:
# Show the dtype of every column in dft.
dft.dtypes

lang_id    object
text       object
dtype: object

In [13]:
# NOTE(review): a function named clean_text is also defined in the earlier cell
# that cleans df_test; running this cell re-binds the name. A single shared
# definition would be enough.
def clean_text(text):
    """Strip punctuation and underscores from ``text`` and normalise whitespace.

    Every character that is neither a word character (letter, digit or
    underscore, Unicode-aware) nor whitespace is deleted, and underscores are
    deleted as well. Each run of whitespace is then collapsed to a single
    space, with no leading or trailing space. Characters are deleted rather
    than replaced by a space, so "umgaqo-siseko" becomes "umgaqosiseko".
    Letter case is left unchanged.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Underscore counts as a word character, so it needs its own alternative
    # next to the "not word, not whitespace" class.
    text = re.sub(r'[^\w\s]|_', '', text)
    # split() with no argument discards leading/trailing whitespace and splits
    # on whitespace runs, so the joined result needs no further strip().
    return ' '.join(text.split())

# Store the cleaned version of each training text in a new 'clean_text'
# column, then preview the result.
dft['clean_text'] = dft['text'].apply(clean_text)
dft.head()

Unnamed: 0,lang_id,text,clean_text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqosiseko wenza amalungiselelo kumaziko axh...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,idha iya kuba nobulumko bokubeka umsebenzi nap...
2,eng,the province of kwazulu-natal department of tr...,the province of kwazulunatal department of tra...
3,nso,o netefatša gore o ba file dilo ka moka tše le...,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [23]:
# Lower-case the text columns of both data sets.
dft['clean_text'] = dft['clean_text'].str.lower()
df_test['text'] = df_test['text'].str.lower()
# Predictions are made from df_test['clean_text'], not df_test['text'], and
# that column already exists by the time this cell runs top-to-bottom, so it
# must be lower-cased explicitly to get the same treatment as the training
# column.
df_test['clean_text'] = df_test['clean_text'].str.lower()
dft['clean_text'].tail()

32994    manuel marin s illfated debt sources but very ...
32995    popo ya dipolateforomo tse ke go tlisa boetele...
32997    closing date for the submission of completed t...
32998    nawuphina umntu ofunyenwe enetyala phantsi kwa...
32999    mafapha a mang le ona a lokela ho etsa ditlale...
Name: clean_text, dtype: object

In [24]:
# Bag-of-words features over single words and adjacent word pairs.
vectorizer = CountVectorizer(ngram_range=(1, 2))

# Learn the vocabulary from the cleaned training text and build the
# document-term count matrix.
# NOTE(review): the vocabulary is fitted on every row of dft, i.e. before the
# train/test split, so the held-out rows also contribute vocabulary entries.
X = vectorizer.fit_transform(dft['clean_text'].values.astype(str))

# Encode the lang_id strings as integer class labels. `preprocessing` is
# imported in the notebook's import cell; `label_encoder` and `le` are two
# names for the same encoder object.
label_encoder = preprocessing.LabelEncoder()
le = label_encoder
y = le.fit_transform(dft['lang_id'])

# Class names in encoded order: labels[i] is the lang_id encoded as i.
labels = list(le.classes_)
labels

['afr', 'eng', 'nbl', 'nso', 'sot', 'ssw', 'tsn', 'tso', 'ven', 'xho', 'zul']

In [25]:
# Hold out 20% of the rows for evaluation; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)  # (training rows, features)

(23958, 715943)


In [26]:
# Build the logistic-regression classifier (at most 1000 solver iterations)
# and train it on the training split in one step; fit() returns the estimator.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predicted class labels for the held-out split.
lr_pred = lr.predict(X_test)

In [34]:
# Score the held-out predictions: overall accuracy, then per-language
# precision / recall / F1.
accuracy = accuracy_score(y_test, lr_pred)

# lr_pred comes from the LogisticRegression model, so label the score as such.
print("Logistic regression Accuracy:", accuracy)
print('Classification Report')
print(classification_report(y_test, lr_pred, target_names=labels))


Linear regression Accuracy: 0.993322203672788
Classification Report
              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       553
         eng       1.00      1.00      1.00       603
         nbl       0.98      0.97      0.98       473
         nso       1.00      0.99      1.00       559
         sot       1.00      1.00      1.00       606
         ssw       0.99      0.99      0.99       481
         tsn       1.00      1.00      1.00       543
         tso       1.00      1.00      1.00       548
         ven       1.00      1.00      1.00       509
         xho       0.98      0.99      0.99       503
         zul       0.98      0.98      0.98       612

    accuracy                           0.99      5990
   macro avg       0.99      0.99      0.99      5990
weighted avg       0.99      0.99      0.99      5990



In [36]:
# Vectorise the cleaned test text with the already-fitted vectorizer.
# (x_test is the unlabelled test matrix; X_test is the held-out training split.)
x_test = vectorizer.transform(df_test['clean_text'].values.astype(str))

# Predict encoded labels, then map them back to the lang_id strings.
pred = lr.predict(x_test)
pred_trans = le.inverse_transform(pred)

# Build the submission: one row per test 'index' with its predicted 'lang_id'.
# DAF is given df_test's row index so that the join below pairs every
# prediction with its own row even if df_test does not carry a default
# 0..n-1 index.
DAF = pd.DataFrame(pred_trans, columns=['lang_id'], index=df_test.index)
output = pd.DataFrame({"index": df_test["index"]})
submission = output.join(DAF)
submission.to_csv("results_df_lr.csv", index=False)