In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pickle
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# DATA LOADING AND VISUALISATION

In [None]:
def open_file(filename, encoding='utf-8'):
    """Read a text file and return its lines.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that accented characters decode the same way regardless of the
            platform's locale default (assumes the corpus files are UTF-8 --
            TODO confirm against the dataset).

    Returns:
        A list with one string per line; each line keeps its trailing newline.
    """
    with open(filename, 'r', encoding=encoding) as f:
        return f.readlines()

In [None]:
# Raw training sentences keyed by language name, one list entry per line of the file
data_raw = {
    'Slovak': open_file('/kaggle/input/language-classification/train_sentences.sk'),
    'English': open_file('/kaggle/input/language-classification/train_sentences.en'),
    'Czech': open_file('/kaggle/input/language-classification/train_sentences.cs'),
}

In [None]:
def show_stats(data):
    """Print summary statistics for every language in ``data``.

    For each language this prints the number of sentences, the number of
    whitespace-separated words, the number of distinct words, and the first
    seven words of the first sentence as a sample.

    Args:
        data: Mapping from a language name to a list of sentence strings.

    A language with no sentences gets an empty sample rather than raising
    ``IndexError``.
    """
    for lang, sentences in data.items():
        no_of_sent = len(sentences)
        word_list = ' '.join(sentences).split()
        no_of_words = len(word_list)
        no_of_unique = len(set(word_list))
        # Take the first 7 words of the first sentence; guard against an empty list
        sample = ' '.join(sentences[0].split()[:7]) if sentences else ''

        print(f"Language: {lang}")
        print("--------------------------")
        print(f"Number of sentences: {no_of_sent}")
        print(f"Number of words: {no_of_words}")
        print(f"Number of unique words: {no_of_unique}")
        print(f"Sample extract: {sample}\n")

In [None]:
# Print per-language sentence/word counts and a sample extract for the raw training data
show_stats(data_raw)

# DATA CLEANING AND PREPROCESSING

Some standard preprocessing steps for text data are:
1) Removing punctuation
2) Converting digits to their equivalent words (the `preprocess` function below simply removes digits instead)
3) Converting to lower case
4) General cleaning, such as removing tags and extra blank spaces

In [None]:
import string
def preprocess(text):
    """Normalise one sentence for bag-of-words vectorisation.

    The text is lower-cased, hyphens become spaces, newlines become spaces,
    and all ASCII punctuation and digits are deleted.

    Args:
        text: The raw sentence.

    Returns:
        The cleaned sentence as a new string.
    """
    # Hyphens must be turned into spaces before the table below deletes punctuation
    dehyphenated = text.lower().replace('-', ' ')
    chars_to_drop = string.punctuation + string.digits
    # Mapping a character to None deletes it during translate()
    table = str.maketrans({'\n': ' ', **dict.fromkeys(chars_to_drop)})
    return dehyphenated.translate(table)
    

In [None]:
# Clean every training sentence, keeping the {language: [sentences]} layout of
# data_raw so that show_stats can be reused on the result
p_data = {
    lang: list(map(preprocess, sentences))
    for lang, sentences in data_raw.items()
}
show_stats(p_data)

# NAIVE BAYES MODEL

In [None]:
# Flatten the per-language dictionary into two parallel lists:
# sent_train[i] is a cleaned sentence and y_train[i] is its language label
sent_train, y_train = [], []

for lang, sentences in p_data.items():
    sent_train.extend(sentences)
    y_train.extend([lang] * len(sentences))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words encoder with scikit-learn's default settings; its vocabulary is learned when it is fitted
vectorizer = CountVectorizer()

In [None]:
# Learn the vocabulary from the training sentences and encode each sentence as
# a row of token counts. The result is kept as the SciPy sparse matrix that
# CountVectorizer returns: MultinomialNB accepts sparse input directly, whereas
# .toarray() would allocate a dense n_sentences x vocabulary_size array that is
# almost entirely zeros.
X_train = vectorizer.fit_transform(sent_train)

In [None]:
# Inspect the training document-term count matrix (one row per entry of sent_train)
X_train

## MODEL CREATION

In [None]:
# Fit a baseline Multinomial Naive Bayes classifier (default hyperparameters)
# on the training count matrix
from sklearn.naive_bayes import MultinomialNB
NB_classifier = MultinomialNB()
NB_classifier.fit(X_train, y_train)

# Persist the fitted vectorizer (not the classifier) -- presumably so the same
# vocabulary can be reloaded to encode new text later
with open("vectorizer.pkl", 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)



In [None]:
# Raw validation sentences keyed by language name, one list entry per line of the file
data_val = {
    'Slovak': open_file('/kaggle/input/language-classification/val_sentences.sk'),
    'English': open_file('/kaggle/input/language-classification/val_sentences.en'),
    'Czech': open_file('/kaggle/input/language-classification/val_sentences.cs'),
}

In [None]:
# Apply the same cleaning to the validation sentences as was applied to the training ones
p_val_data = {
    lang: list(map(preprocess, sentences))
    for lang, sentences in data_val.items()
}

In [None]:
# Print per-language sentence/word counts and a sample extract for the cleaned validation data
show_stats(p_val_data)

In [None]:
# Flatten the per-language dictionary into two parallel lists:
# sent_val[i] is a cleaned sentence and y_val[i] is its language label
sent_val, y_val = [], []

for lang, sentences in p_val_data.items():
    sent_val.extend(sentences)
    y_val.extend([lang] * len(sentences))

In [None]:
# Encode the validation sentences with the vocabulary already learned from the
# training set (transform only -- the vectorizer is not refitted). The result
# stays a SciPy sparse matrix: MultinomialNB.predict accepts sparse input, and
# .toarray() would allocate a dense n_sentences x vocabulary_size array that is
# almost entirely zeros.
X_val = vectorizer.transform(sent_val)

In [None]:
# Predict a language label for every validation sentence
pred = NB_classifier.predict(X_val)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Take the class order from the fitted classifier and use it for both the
# matrix rows/columns and the axis tick labels. A hard-coded label list is only
# correct while it happens to match the sorted label order that
# confusion_matrix uses by default.
class_labels = NB_classifier.classes_
cm = confusion_matrix(y_val, pred, labels=class_labels)
cmd_obj = ConfusionMatrixDisplay(cm, display_labels=class_labels)
cmd_obj.plot()



Because Czech and Slovak are closely related languages, the classifier frequently confuses them with each other.

In [None]:
# Text summary of the per-class metrics for the current predictions on the validation set
print(classification_report(y_val,pred))

# SIMPLE ADJUSTMENTS TO MODEL

`alpha` is the additive smoothing constant. `fit_prior` is set to `False` so that a uniform class prior is used and the unequal number of training sentences per class does not bias the predictions.

In [None]:
# Refit with two changes from the baseline:
#   * alpha is the additive smoothing constant, set far below the default of 1.0
#   * fit_prior=False makes the model use a uniform class prior, so unequal
#     numbers of training sentences per class do not bias the predictions
NB_classifier = MultinomialNB(alpha = 0.0001, fit_prior = False)
NB_classifier.fit(X_train, y_train)
pred = NB_classifier.predict(X_val)

# Take the class order from the fitted classifier so that the matrix axes and
# their tick labels cannot drift apart, as a hard-coded label list could
class_labels = NB_classifier.classes_
cm = confusion_matrix(y_val, pred, labels=class_labels)
cmd_obj = ConfusionMatrixDisplay(cm, display_labels=class_labels)
cmd_obj.plot()

# Persist the tuned classifier to the working directory
with open("naive_bayes_model.pkl", "wb") as model_file:
    pickle.dump(NB_classifier, model_file)


In [None]:
# Text summary of the per-class metrics for the current predictions on the validation set
print(classification_report(y_val, pred))

The F1 score improves substantially compared with the baseline model.