In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

In [3]:
df=pd.read_csv("Language Detection.csv")
df

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English
...,...,...
10332,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...,Kannada
10333,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...,Kannada
10334,ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...,Kannada
10335,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...,Kannada


In [4]:
# 1. Handle missing values
df = df.dropna() 

In [8]:
df = df.drop_duplicates()

In [10]:
if 'text' in df.columns:
    df['text'] = df['text'].str.replace(r'\W+', ' ', regex=True)  # Remove non-alphanumeric characters
    df['text'] = df['text'].str.strip()  # Remove leading and trailing whitespace

# 4. Convert every text to lowercase
    df['text'] = df['text'].str.lower()

In [12]:
df

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English
...,...,...
10332,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...,Kannada
10333,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...,Kannada
10334,ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...,Kannada
10335,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...,Kannada


In [14]:
vectorizer_bow = CountVectorizer()
bow_features = vectorizer_bow.fit_transform(df["Text"])
print("Bag of Words (BoW):")
print(bow_features.toarray())

Bag of Words (BoW):
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [15]:
label_encoder = LabelEncoder()
df["Language_Encoded"] = label_encoder.fit_transform(df["Language"])
print("\nLabel Encoded Languages:")
print(df[["Language", "Language_Encoded"]])


Label Encoded Languages:
      Language  Language_Encoded
0      English                 3
1      English                 3
2      English                 3
3      English                 3
4      English                 3
...        ...               ...
10332  Kannada                 9
10333  Kannada                 9
10334  Kannada                 9
10335  Kannada                 9
10336  Kannada                 9

[10271 rows x 2 columns]


In [18]:
x=df["Text"]
y=df["Language"]

In [20]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [24]:
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Support Vector Machine": SVC(probability=True, random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    "Naive Bayes": MultinomialNB()
}

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score

# Assuming 'x_train' and 'x_test' are still text
# Apply CountVectorizer to convert text into numerical data
vectorizer = CountVectorizer()
x_train_vectorized = vectorizer.fit_transform(x_train)  # Fit and transform the training data
x_test_vectorized = vectorizer.transform(x_test)  # Only transform the test data

# Train and evaluate each model
for model_name, model in models.items():
    print(f"Training and evaluating: {model_name}")
    # Fit the model on the vectorized training data
    model.fit(x_train_vectorized, y_train)
    
    # Predict on the vectorized test data
    y_pred = model.predict(x_test_vectorized)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for {model_name}: {accuracy * 100:.2f}%")
    print(f"Classification Report for {model_name}:\n")
    print(classification_report(y_test, y_pred))
    print("-" * 80)


Training and evaluating: Random Forest
Accuracy for Random Forest: 91.94%
Classification Report for Random Forest:

              precision    recall  f1-score   support

      Arabic       1.00      0.86      0.93       140
      Danish       0.96      0.91      0.93        99
       Dutch       0.99      0.92      0.95       131
     English       0.98      0.99      0.98       344
      French       0.97      0.94      0.95       247
      German       0.98      0.96      0.97       119
       Greek       1.00      0.88      0.94        95
       Hindi       1.00      0.92      0.96        13
     Italian       0.96      0.93      0.95       174
     Kannada       0.34      1.00      0.51        86
   Malayalam       1.00      0.92      0.96       141
  Portugeese       0.99      0.91      0.95       209
     Russian       1.00      0.84      0.92       166
     Spanish       0.94      0.90      0.92       217
    Sweedish       0.99      0.94      0.96       171
       Tamil       