## Step 1 - Project Problem Statement

## Step 2 - Data Gathering

### 2.1 Import Library 

In [1]:
import pandas as pd
import numpy as np
import os
import glob
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the dataset from a CSV file expected in the current working directory.
df = pd.read_csv("Language Detection.csv")
df

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English
...,...,...
10332,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...,Kannada
10333,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...,Kannada
10334,ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...,Kannada
10335,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...,Kannada


In [3]:
df.shape # shape of dataframe

(10337, 2)

In [4]:
df['Language'].unique() # Distinct language labels present in the target column

array(['English', 'Malayalam', 'Hindi', 'Tamil', 'Portugeese', 'French',
       'Dutch', 'Spanish', 'Greek', 'Russian', 'Danish', 'Italian',
       'Turkish', 'Sweedish', 'Arabic', 'German', 'Kannada'], dtype=object)

In [5]:
df['Language'].nunique() # Number of distinct language labels in the target column

17

In [6]:
df['Language'].value_counts() # Number of rows per language label, most frequent first

English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: Language, dtype: int64

## Step 3 - Data Cleaning

In [7]:
df.duplicated(keep = "first").value_counts() # Count rows that repeat an earlier row (True) versus rows that do not (False)

False    10271
True        66
dtype: int64

In [8]:
# The result shows True = 66, which means the dataset contains 66 duplicate rows.

In [9]:
# Remove duplicate rows, keeping the first occurrence of each, and renumber
# the index so it runs from 0 without gaps.
df = df.drop_duplicates(keep="first", ignore_index=True)

In [10]:
df

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English
...,...,...
10266,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...,Kannada
10267,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...,Kannada
10268,ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...,Kannada
10269,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...,Kannada


## Step 4 - EDA (Exploratory Data Analysis)

In [11]:
df.shape # shape of dataframe

(10271, 2)

In [12]:
# Summary statistics of the dataset: for each column, the non-null count,
# the number of unique values, the most frequent value (top) and how often
# that value occurs (freq).

df.describe()

Unnamed: 0,Text,Language
count,10271,10271
unique,10267,17
top,slår mig.,English
freq,2,1382


In [13]:
# df.info()
# It gives the information of dataset
# It shows the non null count and datatypes of every feature

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10271 entries, 0 to 10270
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Text      10271 non-null  object
 1   Language  10271 non-null  object
dtypes: object(2)
memory usage: 160.6+ KB


In [14]:
df.isna().sum() # check null value count of every feature

Text        0
Language    0
dtype: int64

In [15]:
# The counts above confirm that the dataset contains no null values.

## Data Splitting

### Label Encoding

In [16]:
# Replace the language names in the `Language` column with integer class ids.
# NOTE(review): the column is overwritten in place, so re-running this cell on
# the already-encoded frame would fit the encoder on the integer ids rather
# than on the language names. Re-run the notebook from the top instead.
encoder = LabelEncoder()
df['Language'] = encoder.fit_transform(df['Language'])
df

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",3
1,"""Nature"" can refer to the phenomena of the phy...",3
2,"The study of nature is a large, if not the onl...",3
3,"Although humans are part of nature, human acti...",3
4,[1] The word nature is borrowed from the Old F...,3
...,...,...
10266,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...,9
10267,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...,9
10268,ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...,9
10269,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...,9


In [17]:
class_list = encoder.classes_
class_list

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

In [18]:
import pickle

# Persist the fitted label encoder to encoder.pkl.
# The context manager closes the file handle once the dump finishes; the
# previous form passed a bare open() call and never closed it. pickle.dump
# returns None, so its result is no longer bound to a variable.
with open('encoder.pkl', 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

In [19]:
# Separate the independent feature (the raw text) from the dependent
# feature (the language label).
x, y = df['Text'], df['Language']

In [20]:
x # independent feature

0         Nature, in the broadest sense, is the natural...
1        "Nature" can refer to the phenomena of the phy...
2        The study of nature is a large, if not the onl...
3        Although humans are part of nature, human acti...
4        [1] The word nature is borrowed from the Old F...
                               ...                        
10266    ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...
10267    ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...
10268    ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...
10269    ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...
10270    ಟೆರ್ರಿ ನೀವು ನಿಜವಾಗಿಯೂ ಆ ದೇವದೂತನಂತೆ ಸ್ವಲ್ಪ ಕಾಣು...
Name: Text, Length: 10271, dtype: object

In [21]:
y # dependent feature

0        3
1        3
2        3
3        3
4        3
        ..
10266    9
10267    9
10268    9
10269    9
10270    9
Name: Language, Length: 10271, dtype: int32

In [22]:
# Turn the raw text into three sparse feature matrices and split each of them
# 75/25 into train and test parts (same seed, stratified on the label).
#
# NOTE(review): every vectorizer is fitted on the full corpus before the
# split, so the test rows contribute to the vocabulary / IDF statistics.
# Consider splitting the raw text first and fitting on the training part only.
# NOTE(review): in the recorded output of this cell, min_df=0.05 leaves only
# 12 columns and min_df=0.02 on 2-3 word n-grams leaves a single column.

############### 1. Word counts (CountVectorizer)
count_vec = CountVectorizer(analyzer="word")
count_vec_x = count_vec.fit_transform(x)
cv_split = train_test_split(count_vec_x, y, test_size=0.25, random_state=30, stratify=y)
cv_x_train, cv_x_test, cv_y_train, cv_y_test = cv_split
print(*(part.shape for part in cv_split))

############### 2. Word-level TF-IDF
tfidf_vec = TfidfVectorizer(analyzer='word', min_df=0.05)
tfidf_vec_x = tfidf_vec.fit_transform(x)
tfidf_split = train_test_split(tfidf_vec_x, y, test_size=0.25, random_state=30, stratify=y)
tfidf_x_train, tfidf_x_test, tfidf_y_train, tfidf_y_test = tfidf_split
print(*(part.shape for part in tfidf_split))

############### 3. TF-IDF over word n-grams (2 to 3 words)
tfidf_ngram_vec = TfidfVectorizer(analyzer='word', ngram_range=(2, 3), min_df=0.02)
tfidf_ngram_vec_x = tfidf_ngram_vec.fit_transform(x)
tfngram_split = train_test_split(tfidf_ngram_vec_x, y, test_size=0.25, random_state=30, stratify=y)
tfngram_x_train, tfngram_x_test, tfngram_y_train, tfngram_y_test = tfngram_split
print(*(part.shape for part in tfngram_split))

(7703, 39928) (2568, 39928) (7703,) (2568,)
(7703, 12) (2568, 12) (7703,) (2568,)
(7703, 1) (2568, 1) (7703,) (2568,)


## Step 5 - Model Training 

In [23]:
def _print_split_metrics(model, features, labels):
    """Predict on one split and print accuracy, confusion matrix and report."""
    predictions = model.predict(features)
    acc_score = accuracy_score(labels, predictions)
    cnf_matrix = confusion_matrix(labels, predictions)
    clf_report = classification_report(labels, predictions)

    print(f"Accuracy_Score = {acc_score}")
    print(f"Confusion Matrix = \n{cnf_matrix}")
    print(f"Classification Report = \n{clf_report}")


def train_model(model_name, x_train, x_test, y_train, y_test):
    """Fit a classifier and print its metrics on the test and training splits.

    Parameters
    ----------
    model_name : estimator
        Object exposing ``fit(x, y)`` and ``predict(x)``. Despite the
        parameter name this is the estimator instance, not a string; it is
        fitted in place.
    x_train, x_test : feature matrices for the training and test splits.
    y_train, y_test : labels matching ``x_train`` and ``x_test``.

    Returns
    -------
    str
        Always the literal ``"Success"``.
    """
    model_name.fit(x_train, y_train)

    banner = '#' * 50

    # Held-out data first: this is the figure used to compare models.
    print(banner)
    print("TESTING DATA EVALUATION")
    _print_split_metrics(model_name, x_test, y_test)

    # Training data second, so the two sets of metrics can be compared.
    print(banner)
    print("TRAINING DATA EVALUATION")
    print()
    print()
    _print_split_metrics(model_name, x_train, y_train)

    return "Success"

## Step 6 - Model Evaluation

### 1. Logistic Regression

In [24]:
# Logistic regression, evaluated on each of the three feature representations.
# The same estimator object is passed to every call.
lgr_model = LogisticRegression(max_iter=500)

train_model(
    lgr_model,
    x_train=cv_x_train, x_test=cv_x_test,
    y_train=cv_y_train, y_test=cv_y_test,
)
train_model(
    lgr_model,
    x_train=tfidf_x_train, x_test=tfidf_x_test,
    y_train=tfidf_y_train, y_test=tfidf_y_test,
)
train_model(
    lgr_model,
    x_train=tfngram_x_train, x_test=tfngram_x_test,
    y_train=tfngram_y_train, y_test=tfngram_y_test,
)

##################################################
TESTING DATA EVALUATION
Accuracy_Score = 0.9524922118380063
Confusion Matrix = 
[[119   0   0   0   0   0   0   0   0   0   0   0  14   0   0   0   0]
 [  0  96   0   0   0   0   0   0   0   0   0   0   7   0   3   0   0]
 [  0   0 127   1   0   1   0   0   0   0   0   0   5   1   1   0   0]
 [  0   0   0 344   0   0   0   0   1   0   0   0   1   0   0   0   0]
 [  0   0   0   1 240   0   0   0   3   0   0   0   6   2   0   0   0]
 [  0   0   0   0   0 112   0   0   0   0   0   0   4   0   0   0   0]
 [  0   0   0   1   0   0  81   0   0   0   0   0   7   0   0   0   0]
 [  0   0   0   0   0   0   0  14   0   0   0   0   1   0   0   0   0]
 [  0   0   0   0   1   0   0   0 165   0   0   0   5   3   0   0   0]
 [  0   0   0   0   0   0   0   0   0  85   0   0   6   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0 144   0   4   0   0   0   0]
 [  0   0   0   0   0   0   0   0   1   0   0 175   4   4   0   0   0]
 [  0   0   0   0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


##################################################
TESTING DATA EVALUATION
Accuracy_Score = 0.14875389408099687
Confusion Matrix = 
[[  0   0   0 133   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 106   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 136   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 346   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 216  36   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 116   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0  89   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0  15   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 174   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0  91   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 148   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 184   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 17

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'Success'

### 2. K Nearest Neighbour (KNN)

In [25]:
# K-nearest neighbours with default settings, evaluated on each of the three
# feature representations.
knn_model = KNeighborsClassifier()

train_model(
    knn_model,
    x_train=cv_x_train, x_test=cv_x_test,
    y_train=cv_y_train, y_test=cv_y_test,
)
train_model(
    knn_model,
    x_train=tfidf_x_train, x_test=tfidf_x_test,
    y_train=tfidf_y_train, y_test=tfidf_y_test,
)
train_model(
    knn_model,
    x_train=tfngram_x_train, x_test=tfngram_x_test,
    y_train=tfngram_y_train, y_test=tfngram_y_test,
)

##################################################
TESTING DATA EVALUATION
Accuracy_Score = 0.5346573208722741
Confusion Matrix = 
[[ 37   0   0   0   0   0   0   0   0  53   7   0  16   0   0  20   0]
 [  1  59   0   0   0   0   0   0   0  18   2   0   6   0   2  18   0]
 [  2   1  67   0   4   0   0   0   1  32   1   0   8   1   0  19   0]
 [  8   0   8 199   0   0   0   0   4  78  13   0   9   0   0  27   0]
 [  6   2   3   0 154   0   0   0   0  43   8   0   7  12   0  17   0]
 [  4   2   0   0   0  55   0   0   2  30   7   0   5   0   0  11   0]
 [  3   0   0   0   0   0  53   0   0  16   3   0   8   0   0   6   0]
 [  0   0   0   0   0   0   0   9   0   3   0   0   1   0   0   2   0]
 [  3   0   0   1   2   0   0   0  90  47   6   0   4   1   0  20   0]
 [  0   0   0   0   0   0   0   0   0  76   4   0   6   0   0   5   0]
 [  1   0   0   0   0   0   0   0   0  31 101   0   4   0   0  11   0]
 [  6   0   0   3   6   0   0   0   0  29   3 106   5   7   0  19   0]
 [  2   0   0   0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy_Score = 0.34890965732087226
Confusion Matrix = 
[[132   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [ 51  13   2   0   1   2   0   0   0   0   0   2   0   0  35   0   0]
 [ 56   6  38   8   1   0   0   0   1   0   0  11   0   1  14   0   0]
 [ 34   2   2 308   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [ 46   1   7   0 105   0   0   0  16   0   0  28   0  38  11   0   0]
 [ 74   0   0  19   0   2   0   0   0   0   0   0   0   0  21   0   0]
 [ 88   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [ 15   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [ 67   7   0  14  21   0   0   0  65   0   0   0   0   0   0   0   0]
 [ 91   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [147   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [ 46   0   0   1   2   0   0   0   0   0   0  95   0  40   0   0   0]
 [169   2   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [ 34   3   8   0  4

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy_Score = 0.3473971180059717
Confusion Matrix = 
[[395   3   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [133  46   2   1   1   0   0   0   0   0   0   6   0   0 129   0   0]
 [173   7 110  33   0   0   0   0   1   0   0  25   0   3  54   0   0]
 [137  14   1 879   0   0   0   0   5   0   0   0   0   0   0   0   0]
 [120   6  27   0 339   0   0   0  31   0   0  99   0 101  32   0   0]
 [251   0   0  38   0   6   0   0   0   0   0   0   0   0  54   0   0]
 [265   4   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [ 43   1   0   3   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [206  27   0  56  43   0   0   0 187   0   0   0   0   0   1   0   0]
 [274   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [441   1   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [187   0   5   0   1   0   0   0   0   0   0 273   0  85   1   0   0]
 [505   5   0   6   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [122   2  14   0 110

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy_Score = 0.059190031152647975
Confusion Matrix = 
[[  0   0   0   0   0 133   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0 106   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0 136   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0 346   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0  36 216   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0 116   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0  89   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0  15   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0 174   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0  91   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0 148   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0 184   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0 172   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'Success'

### 3. Decision Tree

In [26]:
# Decision tree with a fixed seed, evaluated on each of the three feature
# representations.
dt_model = DecisionTreeClassifier(random_state=2)

train_model(
    dt_model,
    x_train=cv_x_train, x_test=cv_x_test,
    y_train=cv_y_train, y_test=cv_y_test,
)
train_model(
    dt_model,
    x_train=tfidf_x_train, x_test=tfidf_x_test,
    y_train=tfidf_y_train, y_test=tfidf_y_test,
)
train_model(
    dt_model,
    x_train=tfngram_x_train, x_test=tfngram_x_test,
    y_train=tfngram_y_train, y_test=tfngram_y_test,
)

##################################################
TESTING DATA EVALUATION
Accuracy_Score = 0.8714953271028038
Confusion Matrix = 
[[107   0   0   1   0   0   0   0   0  25   0   0   0   0   0   0   0]
 [  0  81   0   4   3   0   0   0   0  10   0   2   0   0   6   0   0]
 [  0   0 116   2   0   0   0   0   1   7   0   1   0   4   5   0   0]
 [  0   0   0 332   0   0   0   0   6   5   0   0   0   2   1   0   0]
 [  0   1   2   1 210   0   0   0  11  11   0   1   0  14   1   0   0]
 [  0   3   0   1   0 105   0   0   2   5   0   0   0   0   0   0   0]
 [  0   0   0   1   0   0  76   0   0  12   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0  13   0   2   0   0   0   0   0   0   0]
 [  0   0   0   3   5   0   0   0 150  10   0   0   0   6   0   0   0]
 [  0   0   0   0   0   0   0   0   0  91   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0  14 134   0   0   0   0   0   0]
 [  0   0   0   1   0   0   0   0   0   9   0 158   0  14   1   0   1]
 [  0   0   0   2

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'Success'

### 4. Random Forest

In [27]:
# Random forest, evaluated on each of the three feature representations.
# A fixed random_state makes the reported metrics reproducible across runs;
# the seed value matches the one used for the decision tree cell.
rf_model = RandomForestClassifier(random_state=2)

train_model(
    rf_model,
    x_train=cv_x_train, x_test=cv_x_test,
    y_train=cv_y_train, y_test=cv_y_test,
)
train_model(
    rf_model,
    x_train=tfidf_x_train, x_test=tfidf_x_test,
    y_train=tfidf_y_train, y_test=tfidf_y_test,
)
train_model(
    rf_model,
    x_train=tfngram_x_train, x_test=tfngram_x_test,
    y_train=tfngram_y_train, y_test=tfngram_y_test,
)

##################################################
TESTING DATA EVALUATION
Accuracy_Score = 0.927570093457944
Confusion Matrix = 
[[110   0   0   1   0   0   0   0   0  22   0   0   0   0   0   0   0]
 [  0  90   0   1   0   0   0   0   0   7   0   1   0   0   7   0   0]
 [  0   1 125   1   1   0   0   0   0   6   0   0   0   1   1   0   0]
 [  0   0   0 342   0   0   0   0   1   3   0   0   0   0   0   0   0]
 [  0   0   0   1 237   0   0   0   3   7   0   1   0   3   0   0   0]
 [  0   1   0   0   0 110   0   0   0   5   0   0   0   0   0   0   0]
 [  0   0   0   1   0   0  81   0   0   7   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0  15   0   0   0   0   0   0   0   0   0]
 [  0   0   0   1   1   0   0   0 161   8   0   0   0   3   0   0   0]
 [  0   0   0   0   0   0   0   0   0  91   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   9 139   0   0   0   0   0   0]
 [  0   0   0   1   0   0   0   0   0   5   0 174   0   4   0   0   0]
 [  0   0   0   1 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy_Score = 0.39504089315850965
Confusion Matrix = 
[[  0   0   0   1   0   0   0   0   0   0   0   0 395   0   3   0   0]
 [  0 124   1   1   3   0   0   0   0   0   0   6 133   0  50   0   0]
 [  0   0 106  11  13   0   0   0  20   0   0  25 173   5  53   0   0]
 [  0   0   1 854   0   0   0   0  30   0   0   0 137   0  14   0   0]
 [  0  11   3   0 380   0   0   0  54   0   0 127 120  36  24   0   0]
 [  0  54   0   0   0   6   0   0  38   0   0   0 251   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0 265   0   4   0   0]
 [  0   0   0   2   0   0   0   2   0   0   0   0  43   0   0   0   0]
 [  0   0   0   2   0   0   0   0 284   0   0   0 206   0  28   0   0]
 [  0   0   0   1   0   0   0   0   0   0   0   0 274   0   0   0   0]
 [  0   0   0   1   0   0   0   0   0   0   0   0 441   0   1   0   0]
 [  0   0   3   0   1   0   0   0   0   0   0 356 187   5   0   0   0]
 [  0   0   0   6   0   0   0   0   0   0   0   0 505   0   5   0   0]
 [  0   0   4   0  8

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy_Score = 0.14760482928729066
Confusion Matrix = 
[[   0    0    0  399    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0  318    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0  406    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0 1036    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0  654  101    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0  349    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0  269    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0   47    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0  520    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0  275    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0  443    0    0    0    0  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'Success'

### 5. Support Vector Machine

In [28]:
# Support vector classifier with default settings, evaluated on each of the
# three feature representations.
svc_model = SVC()

train_model(
    svc_model,
    x_train=cv_x_train, x_test=cv_x_test,
    y_train=cv_y_train, y_test=cv_y_test,
)
train_model(
    svc_model,
    x_train=tfidf_x_train, x_test=tfidf_x_test,
    y_train=tfidf_y_train, y_test=tfidf_y_test,
)
train_model(
    svc_model,
    x_train=tfngram_x_train, x_test=tfngram_x_test,
    y_train=tfngram_y_train, y_test=tfngram_y_test,
)

##################################################
TESTING DATA EVALUATION
Accuracy_Score = 0.8808411214953271
Confusion Matrix = 
[[102   0   0   0   0   0   0   0   0   0   0   0  31   0   0   0   0]
 [  0  79   0   2   1   0   0   0   0   0   0   0  18   0   5   0   1]
 [  0   0 117   1   2   0   0   0   0   0   0   0  13   3   0   0   0]
 [  0   0   0 337   0   0   0   0   0   0   0   0   9   0   0   0   0]
 [  0   0   0   0 232   0   0   0   2   0   0   0  15   3   0   0   0]
 [  0   0   0   1   0 103   0   0   0   0   0   0  11   0   1   0   0]
 [  0   0   0   2   0   0  68   0   0   0   0   0  19   0   0   0   0]
 [  0   0   0   6   0   0   0   3   0   0   0   0   6   0   0   0   0]
 [  0   0   0   0   0   0   0   0 153   0   0   0  18   3   0   0   0]
 [  0   0   0   1   0   0   0   0   0  68   0   0  22   0   0   0   0]
 [  0   0   0   1   0   0   0   0   0   0 131   0  16   0   0   0   0]
 [  0   0   0   0   1   0   0   0   0   0   0 165  13   4   0   0   1]
 [  0   0   0   0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy_Score = 0.3925233644859813
Confusion Matrix = 
[[  0   0   0   1   0   0   0   0   0   0   0   0 132   0   0   0   0]
 [  0  32   0   0   3   0   0   0   0   0   0   2  51   0  18   0   0]
 [  0   0  33   5   5   0   0   0   3   0   0  11  56   2  21   0   0]
 [  0   0   1 306   0   0   0   0   3   0   0   0  34   0   2   0   0]
 [  0   3   0   0 122   0   0   0  26   0   0  30  46  17   8   0   0]
 [  0  21   0   0   0   2   0   0  19   0   0   0  74   0   0   0   0]
 [  0   0   0   1   0   0   0   0   0   0   0   0  88   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0  15   0   0   0   0]
 [  0   0   0   1   0   0   0   0  99   0   0   0  67   0   7   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0  91   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0 147   0   1   0   0]
 [  0   0   0   1   1   0   0   0   0   0   0 134  46   2   0   0   0]
 [  0   0   0   1   0   0   0   0   0   0   0   0 169   0   2   0   0]
 [  0   0   1   0  61

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy_Score = 0.380501103466182
Confusion Matrix = 
[[  0   0   0   1   0   0   0   0   0   0   0   0 395   0   3   0   0]
 [  0 123   2   1   3   0   0   0   0   0   0   6 133   0  50   0   0]
 [  0   0  95  13  19   0   0   0  20   0   0  26 173   7  53   0   0]
 [  0   0   1 853   0   0   0   0  31   0   0   0 137   0  14   0   0]
 [  0  12   1   0 367   0   0   0  55   0   0 129 120  45  26   0   0]
 [  0  54   0   0   0   6   0   0  38   0   0   0 251   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0 265   0   4   0   0]
 [  0   1   0   3   0   0   0   0   0   0   0   0  43   0   0   0   0]
 [  0   0   0   1   0   0   0   0 285   0   0   0 206   0  28   0   0]
 [  0   0   0   1   0   0   0   0   0   0   0   0 274   0   0   0   0]
 [  0   0   0   1   0   0   0   0   0   0   0   0 441   0   1   0   0]
 [  0   0   4   0   2   0   0   0   0   0   0 356 187   3   0   0   0]
 [  0   0   0   6   0   0   0   0   0   0   0   0 505   0   5   0   0]
 [  0   0   5   0 152 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy_Score = 0.14875389408099687
Confusion Matrix = 
[[  0   0   0 133   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 106   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 136   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 346   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 216  36   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 116   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0  89   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0  15   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 174   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0  91   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 148   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 184   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 172   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 182  2

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'Success'

### 6. Naive Bayes 

In [29]:
# Multinomial Naive Bayes with default settings, evaluated on each of the
# three feature representations.
nb_model = MultinomialNB()

train_model(
    nb_model,
    x_train=cv_x_train, x_test=cv_x_test,
    y_train=cv_y_train, y_test=cv_y_test,
)
train_model(
    nb_model,
    x_train=tfidf_x_train, x_test=tfidf_x_test,
    y_train=tfidf_y_train, y_test=tfidf_y_test,
)
train_model(
    nb_model,
    x_train=tfngram_x_train, x_test=tfngram_x_test,
    y_train=tfngram_y_train, y_test=tfngram_y_test,
)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


##################################################
TESTING DATA EVALUATION
Accuracy_Score = 0.9719626168224299
Confusion Matrix = 
[[127   0   0   6   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0  96   0   5   0   1   0   0   0   0   0   0   0   0   4   0   0]
 [  0   0 132   4   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 346   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   3 248   0   0   0   1   0   0   0   0   0   0   0   0]
 [  0   0   0   2   0 113   0   0   0   0   0   0   0   0   1   0   0]
 [  0   0   0   3   0   0  85   0   0   0   1   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0  15   0   0   0   0   0   0   0   0   0]
 [  0   0   0   2   1   0   0   0 168   0   0   0   0   3   0   0   0]
 [  0   0   0   4   0   0   0   0   0  87   0   0   0   0   0   0   0]
 [  0   0   0   1   0   0   0   0   0   0 147   0   0   0   0   0   0]
 [  0   0   0   1   0   0   0   0   0   0   0 181   0   2   0   0   0]
 [  0   0   0   6

Accuracy_Score = 0.13449305465403089
Confusion Matrix = 
[[   0    0    0  399    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0  318    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0  406    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0 1036    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0  755    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0  349    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0  269    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0   47    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0  520    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0  275    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [   0    0    0  443    0    0    0    0  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'Success'

## Final Model Selection

In [30]:
# Final model: a fresh Multinomial Naive Bayes fitted on the count-vector
# features only, so `nb_model` ends this cell trained on that representation.
nb_model = MultinomialNB()

train_model(
    nb_model,
    x_train=cv_x_train, x_test=cv_x_test,
    y_train=cv_y_train, y_test=cv_y_test,
)

##################################################
TESTING DATA EVALUATION
Accuracy_Score = 0.9719626168224299
Confusion Matrix = 
[[127   0   0   6   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0  96   0   5   0   1   0   0   0   0   0   0   0   0   4   0   0]
 [  0   0 132   4   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 346   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   3 248   0   0   0   1   0   0   0   0   0   0   0   0]
 [  0   0   0   2   0 113   0   0   0   0   0   0   0   0   1   0   0]
 [  0   0   0   3   0   0  85   0   0   0   1   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0  15   0   0   0   0   0   0   0   0   0]
 [  0   0   0   2   1   0   0   0 168   0   0   0   0   3   0   0   0]
 [  0   0   0   4   0   0   0   0   0  87   0   0   0   0   0   0   0]
 [  0   0   0   1   0   0   0   0   0   0 147   0   0   0   0   0   0]
 [  0   0   0   1   0   0   0   0   0   0   0 181   0   2   0   0   0]
 [  0   0   0   6

'Success'

In [31]:
import pickle

# Persist the trained Naive Bayes model to model.pkl.
# The context manager closes the file handle once the dump finishes; the
# previous form passed a bare open() call and never closed it. pickle.dump
# returns None, so its result is no longer bound to a variable.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(nb_model, model_file)

In [32]:
# Persist the fitted CountVectorizer to count_vec.pkl.
# The context manager closes the file handle once the dump finishes; the
# previous form passed a bare open() call and never closed it. pickle.dump
# returns None, so its result is no longer bound to a variable.
with open('count_vec.pkl', 'wb') as count_vec_file:
    pickle.dump(count_vec, count_vec_file)

## User-Defined Function

In [33]:
def prediction(article):
    """Predict the language of ``article`` and return it as a message.

    Relies on the notebook-level ``count_vec`` (fitted vectorizer),
    ``nb_model`` (fitted classifier) and ``encoder`` (fitted label encoder).

    Parameters
    ----------
    article : str or iterable of str
        Text to classify. A string is used unchanged. Any other iterable of
        strings is joined with single spaces, so that words at the boundary
        between two items are not fused into one token.

    Returns
    -------
    str
        ``"Text Language = <name>"``, where ``<name>`` is the entry of
        ``encoder.classes_`` for the predicted class id.
    """
    document = article if isinstance(article, str) else " ".join(article)
    features = count_vec.transform([document])
    predicted_id = nb_model.predict(features)[0]
    language = encoder.classes_[predicted_id]
    return f"Text Language = {language}"

In [34]:
# Interactive demo: input() waits for text to be typed, so this cell pauses an
# unattended "Run All" until a value is entered.
article = input("Enter Text == ")
prediction(article)

Enter Text == The Times of India is an Indian English-language daily newspaper and digital news media owned and managed by The Times Group. It is the third-largest newspaper in India by circulation and largest selling English-language daily in the world.


'Text Language = English'

In [35]:
# The Times of India is an Indian English-language daily newspaper and digital news media owned and managed by The Times Group. It is the third-largest newspaper in India by circulation and largest selling English-language daily in the world.

In [36]:
# Interactive demo: waits for typed text, then reports its predicted language.
article = input("Enter Text == ")
prediction(article)

Enter Text == டுர்ஹாம்: இங்கிலாந்துக்கு சுற்றுப்பயணம் மேற்கொண்டுள்ள தென்னாப்பிரிக்க கிரிக்கெட் அணி மூன்று ஒரு நாள் போட்டி, மூன்று டி20 போட்டி மற்றும் 3 டெஸ்ட் போட்டி கொண்ட தொடரில் விளையாடுகிறது.


'Text Language = Tamil'

In [37]:
# டுர்ஹாம்: இங்கிலாந்துக்கு சுற்றுப்பயணம் மேற்கொண்டுள்ள தென்னாப்பிரிக்க கிரிக்கெட் அணி மூன்று ஒரு நாள் போட்டி, மூன்று டி20 போட்டி மற்றும் 3 டெஸ்ட் போட்டி கொண்ட தொடரில் விளையாடுகிறது.

In [38]:
# Interactive demo: waits for typed text, then reports its predicted language.
article = input("Enter Text == ")
prediction(article)

Enter Text == ಬುಧವಾರವು ವಿಘ್ನ ವಿನಾಶಕ ಗಣೇಶನಿಗೆ ಮೀಸಲಾದ ದಿನ. ಇಂದು ಗಣೇಶನಿಗೆ ವಿಶೇಷ ಪೂಜೆ ಸಲ್ಲಿಸುವ ಮೂಲಕ ಅವರ ಕೃಪೆಗೆ ಪಾತ್ರರಾಗಬಹುದು. ಇಂದಿನ ದ್ವಾದಶ ರಾಶಿಗಳ ಫಲ ಹೇಗಿದೆ ತಿಳಿಯೋಣ


'Text Language = Kannada'

In [39]:
# ಬುಧವಾರವು ವಿಘ್ನ ವಿನಾಶಕ ಗಣೇಶನಿಗೆ ಮೀಸಲಾದ ದಿನ. ಇಂದು ಗಣೇಶನಿಗೆ ವಿಶೇಷ ಪೂಜೆ ಸಲ್ಲಿಸುವ ಮೂಲಕ ಅವರ ಕೃಪೆಗೆ ಪಾತ್ರರಾಗಬಹುದು. ಇಂದಿನ ದ್ವಾದಶ ರಾಶಿಗಳ ಫಲ ಹೇಗಿದೆ ತಿಳಿಯೋಣ

In [40]:
# Interactive demo: waits for typed text, then reports its predicted language.
article = input("Enter Text == ")
prediction(article)

Enter Text == Les drames ont été provoqués par des chutes d’arbres et de toiture. Le ministre de l’intérieur, Gérald Darmanin, se rendra sur place dans l’après-midi.


'Text Language = French'

In [41]:
# Google Pinterest Digg Linkedin Reddit Stumbleu

In [47]:
# Interactive demo: waits for typed text, then reports its predicted language.
article = input("Enter Text == ")
prediction(article)

Enter Text == Así se inundó el metro de París por las fuertes lluvias de tormenta


'Text Language = Spanish'

In [43]:
# Así se inundó el metro de París por las fuertes lluvias de tormenta

In [44]:
# Name of the input text column, kept in a dict so it can be written to JSON.
columns_dict = dict(col_name=["Text"])
columns_dict

{'col_name': ['Text']}

In [45]:
import json

In [46]:
# Write the column-name mapping to columns_name.json in the working directory.
with open('columns_name.json', mode='w') as out_file:
    json.dump(columns_dict, out_file)