In [15]:
import pandas as pd
import numpy as np
import re
import warnings

In [16]:
## Importing classification models
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
data = pd.read_csv('data/Language Detection.csv')
data.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [18]:
# Separating the independent and dependent features
X = data['Text']
y = data['Language']

In [19]:
# Converting y into numerical values
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

# Removing symobols numbers from the input text

text_list = []

for text in X:

    # Removing the symbols and numbers using regex
    text = re.sub(r'[!@#$(),n%^&*?:;~`0-9]', ' ', text)

    # Removing Square brackets
    text = text.replace("\\", "")
    text = text.replace("[", "")
    text = text.replace("]", "")

    # Converting it into lower order
    text = text.lower()

    # Appending the text into the new list
    text_list.append(text)
    

# Converting the input into numerical values using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
X = cv.fit_transform(text_list).toarray()
X.shape

In [20]:
from sklearn.base import BaseEstimator, TransformerMixin
class Remove(BaseEstimator, TransformerMixin):
    def fit(self, X):
        return self
    
    def transform(self, X):
        text_list = []
        for text in X:
            text = re.sub(r'[!@#$(),n%^&*?:;~`0-9]]', ' ', text)
            text = text.replace("[", "")
            text = text.replace("]", "")
            text = text.lower()
            text_list.append(text)
        return text_list


class Vectorizer(BaseEstimator, TransformerMixin):
    def fit(self, X):
        return self
    
    def transform(self, X):
        cv = CountVectorizer()
        bow = cv.fit_transform(X).toarray()
        return bow


In [21]:
from sklearn.pipeline import Pipeline
input_pipeline = Pipeline([
    ("remover", Remove()),
    ("vectorizer", Vectorizer())
])

In [13]:
# Splitting into training and testing datasets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [14]:
x_train_arr = input_pipeline.fit_transform(x_train)
x_test_arr = input_pipeline.transform(x_test)
print(x_train_arr.shape)
print(x_test_arr.shape)

(8269, 34883)
(2068, 14061)


In [36]:
rem = Remove()
new_text_list = rem.fit_transform(X)
vec = Vectorizer()
array = vec.fit_transform(new_text_list)
print(array.shape)

(10337, 39993)


In [23]:
# Splitting into training and testing datasets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [24]:
x_train.shape

(8269, 34968)

In [25]:
x_test.shape

(2068, 34968)

In [9]:
# Evaluate model function
from sklearn.metrics import accuracy_score

def evaluate_model(true, pred):
    acc = accuracy_score(true, pred)
    return acc



In [12]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Multinomial Naive Bayes": MultinomialNB(),
    "Random Forest Classifier": RandomForestClassifier(),
    "Support Vector Classifier": SVC(kernel='poly', random_state=0)
}

In [25]:
# # models = {
# #     "Logistic Regression": LogisticRegression(),
# #     "Guassian Naive Bayes": GaussianNB(),
# #     "Multinomial Naive Bayes": MultinomialNB(),
# #     "Bernoulli Naive Bayes": BernoulliNB()
# # }

# model_list=[]

# for i in range(len(list(models))):
#     model = list(models.values())[i]
#     model.fit(x_train, y_train)

#     # Predicting
#     y_train_pred = model.predict(x_train)
#     y_test_pred = model.predict(x_test)

#     # Evaluating train and testing data set
#     model_train_acc = evaluate_model(y_train_pred, y_train)

#     model_test_acc = evaluate_model(y_test_pred, y_test)

#     # Printing
#     print(list(models.keys())[i])
#     model_list.append(list(models.keys())[i])

#     print("Model Performance on training set")
#     print("Accuracy Score: {:.4f}".format(model_train_acc))

#     print("------------------------------------------")

#     print("Model Performance on testing set")
#     print("Accuracy Score: {:.4f}".format(model_test_acc))

#     print("------------------------------------------")




In [10]:
# Since multinomialNB is showing the best accuracy
model = MultinomialNB()
model.fit(x_train, y_train)

In [11]:
y_pred_test = model.predict(x_test)

In [12]:
accuracy_score(y_test, y_pred_test)

0.9782398452611218

In [20]:
def predict(text:str, model)->str:
    x = cv.transform([text]).toarray()
    lang = model.predict(x)
    lang = le.inverse_transform(lang)
    return lang[0]

In [22]:
predict("hello", model)

'English'

In [23]:
predict("नमस्ते, आप कैसे हैं", model)

'Hindi'

Creating a model pickle file

In [17]:
model_path = "model/model.pkl"

In [19]:
import pickle
pickle.dump(model, open('model.pkl', 'wb'))

In [26]:
new_model = pickle.load(open(r'D:\Language\model\model.pkl', 'rb'))
print(predict("hello world", new_model))

English
