<a href="https://colab.research.google.com/github/Sankalp-Bisht/Language-classifier/blob/master/Language_Predictor_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import matplotlib.pyplot as mpl 
import numpy as np
import string

from collections import defaultdict

from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import plot_confusion_matrix
import joblib
import pickle as pkl

In [0]:
def open_file(filename):
  with open(filename ,r) as f:
    data=f.readlines()
    return data

In [0]:
data_raw = dict()
data_raw["English"] = open(r"English2.txt").readlines()
data_raw["Spanish"] = open(r"Spanish2.txt").readlines()
data_raw["French"] = open(r"french2.txt").readlines()

In [0]:
def showstats(lang):
  for language, sentences in lang.items():

    number_of_sentences=0
    number_of_words=0
    number_of_unique_words=0
    sample_extract= ''

    word_list = ' '.join(sentences).split()
        
    number_of_sentences=len(sentences)
    number_of_words=len(word_list)
    number_of_unique_world= len(set(word_list))
    sample_extract=' '.join(sentences[0].split()[0:7])
    word_list = ' '.join(sentences).split()

    word_list = ' '.join(sentences).split()
    print(f'Language: {language}')
    print('-----------------------')
    print(f'Number of sentences\t:\t {number_of_sentences}')
    print(f'Number of words\t\t:\t {number_of_words}')
    print(f'Number of unique words\t:\t {number_of_unique_words}')
    print(f'Sample extract\t\t:\t {sample_extract}...\n')

    

In [113]:
showstats(data_raw)

Language: English
-----------------------
Number of sentences	:	 401
Number of words		:	 11333
Number of unique words	:	 0
Sample extract		:	 The English Wikipedia is the English-language edition...

Language: Spanish
-----------------------
Number of sentences	:	 498
Number of words		:	 9644
Number of unique words	:	 0
Sample extract		:	 El príncipe Alberto Víctor, duque de Clarence...

Language: French
-----------------------
Number of sentences	:	 200
Number of words		:	 6402
Number of unique words	:	 0
Sample extract		:	 Hilda Rix Nicholas, née le 1er septembre...



In [0]:
def preprocess(text):
    preprocessed_text = text.lower().replace('-', ' ')
    
    translation_table = str.maketrans('\n', ' ', string.punctuation + string.digits)
    
    preprocessed_text = preprocessed_text.translate(translation_table)
        
    return preprocessed_text

In [0]:
data_preprocessed = {k: [preprocess(sentences) for sentences in v] for k , v in data_raw.items()}

In [116]:
showstats(data_preprocessed)

Language: English
-----------------------
Number of sentences	:	 401
Number of words		:	 10807
Number of unique words	:	 0
Sample extract		:	 the english wikipedia is the english language...

Language: Spanish
-----------------------
Number of sentences	:	 498
Number of words		:	 9454
Number of unique words	:	 0
Sample extract		:	 el príncipe alberto víctor duque de clarence...

Language: French
-----------------------
Number of sentences	:	 200
Number of words		:	 6065
Number of unique words	:	 0
Sample extract		:	 hilda rix nicholas née le er septembre...



In [0]:
sentences_train , y_train = [], []

for k, v in data_preprocessed.items():
  for sentences in v:
    sentences_train.append(sentences)
    y_train.append(k)

In [0]:
vectorizer = CountVectorizer()

In [0]:
X_train = vectorizer.fit_transform(sentences_train)

In [120]:
NC = MultinomialNB(alpha=0.01, fit_prior=True)
NC.fit(X_train, y_train)


MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [0]:
data_val = dict()
data_val['Spanish'] = open(r"Spanishtest.txt").readlines()
data_val['French'] = open(r"Frenchtest.txt").readlines()
data_val['English'] = open(r"Englishtest.txt").readlines()

data_val_preprocessed = {k: [preprocess(sentence) for sentence in v] for k, v in data_val.items()}

In [0]:
sentences_val, y_val = [], []

for k, v in data_val_preprocessed.items():
    for sentence in v:
        sentences_val.append(sentence)
        y_val.append(k)

In [0]:
X_val = vectorizer.transform(sentences_val)

In [0]:
predictions = NC.predict(X_val)

In [125]:
f1_score(y_val, predictions, average='weighted')

0.8993366420397274

**Testing the model against few sentences**

In [136]:
sentence_1 = "Thankyou for visiting the project"
sentence_1 = [preprocess(sentence_1)]
text_vectorized = vectorizer.transform(sentence_1)
Predict = NC.predict(text_vectorized)
Predict[0]

'English'

In [137]:
sentence_2 = "Gracias por visitar el proyecto"
sentence_2 = [preprocess(sentence_2)]
text_vectorized = vectorizer.transform(sentence_2)
Predict = NC.predict(text_vectorized)
Predict[0]

'Spanish'

In [138]:
sentence_3 = "Merci d'avoir visité le projet"
sentence_3 = [preprocess(sentence_3)]
text_vectorized = vectorizer.transform(sentence_3)
Predict = NC.predict(text_vectorized)
Predict[0]

'French'