In [1]:
import pandas as pd
import numpy as np
import re
import warnings

In [2]:
## Importing classification models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

In [3]:
data = pd.read_csv('Language Detection.csv')
data.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [4]:
# Separating the independent and dependent features
X = data['Text']
y = data['Language']

In [5]:
# Converting y into numerical values
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [6]:
# Removing symobols numbers from the input text

text_list = []

for text in X:

    # Removing the symbols and numbers using regex
    text = re.sub(r'[!@#$(),n%^&*?:;~`0-9]', ' ', text)

    # Removing Square brackets
    text = re.sub(r'[[]]', ' ', text)

    # Converting it into lower order
    text = text.lower()

    # Appending the text into the new list
    text_list.append(text)
    

  text = re.sub(r'[[]]', ' ', text)


In [7]:
# Converting the input into numerical values using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
X = cv.fit_transform(text_list).toarray()
X.shape

(10337, 34937)

In [8]:
# Splitting into training and testing datasets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [12]:
# Evaluate model function
from sklearn.metrics import accuracy_score

def evaluate_model(true, pred):
    acc = accuracy_score(true, pred)
    return acc



In [10]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Multinomial Naive Bayes": MultinomialNB(),
    "Gaussian Naive Bayes": GaussianNB()
}

In [13]:
# models = {
#     "Logistic Regression": LogisticRegression(),
#     "Guassian Naive Bayes": GaussianNB(),
#     "Multinomial Naive Bayes": MultinomialNB(),
#     "Bernoulli Naive Bayes": BernoulliNB()
# }

model_list=[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train, y_train)

    # Predicting
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Evaluating train and testing data set
    model_train_acc = evaluate_model(y_train_pred, y_train)

    model_test_acc = evaluate_model(y_test_pred, y_test)

    # Printing
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model Performance on training set")
    print("Accuracy Score: {:.4f}".format(model_train_acc))

    print("------------------------------------------")

    print("Model Performance on testing set")
    print("Accuracy Score: {:.4f}".format(model_test_acc))

    print("------------------------------------------")




Logistic Regression
Model Performance on training set
Accuracy Score: 0.9959
------------------------------------------
Model Performance on testing set
Accuracy Score: 0.9507
Multinomial Naive Bayes
Model Performance on training set
Accuracy Score: 0.9892
------------------------------------------
Model Performance on testing set
Accuracy Score: 0.9700
