## Importing libraries and dataset
So let’s get started. First of all, we will import all the required libraries.

In [18]:
# Standard library
import os
import re
import warnings
from collections import Counter
from pathlib import Path

# Data handling and plotting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.simplefilter("ignore")

# Preprocessing
from nltk.corpus import stopwords, wordnet
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Building classification models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Model evaluation
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score

### Import datasets

In [19]:
# Folder that holds train_set.csv and test_set.csv. Set the LANG_ID_DATA_DIR
# environment variable to point at your own copy of the data; the fallback is
# the folder the notebook was originally run from.
DATA_DIR = Path(os.environ.get(
    "LANG_ID_DATA_DIR",
    r'C:\Users\HP\Downloads\language-identification',
))

train_data = pd.read_csv(DATA_DIR / 'train_set.csv')
test_data = pd.read_csv(DATA_DIR / 'test_set.csv')

#### Counting the number of samples for each language

In [20]:
# Number of training rows per language label (equal counts mean balanced classes).
train_data["lang_id"].value_counts()

sot    3000
nbl    3000
xho    3000
tsn    3000
zul    3000
afr    3000
nso    3000
ven    3000
ssw    3000
tso    3000
eng    3000
Name: lang_id, dtype: int64

#### Separating Independent and Dependent features

In [21]:
# Feature column (raw text) and target column (language label).
# Both names are bound again after the text-cleaning step further down.
X, y = train_data["text"], train_data["lang_id"]

#### Text Preprocessing

In [22]:
def clean_text(df):
  """Normalise the ``text`` column of ``df`` in place.

  Every entry is lower-cased, stripped of any character that is neither a
  word character nor whitespace (i.e. punctuation and symbols), stripped of
  digits, has each run of two or more whitespace characters collapsed to a
  single space, and is trimmed of leading and trailing whitespace.

  The cleaned values are written back as a whole column, so the function
  works for any index. Writing row by row with a running counter as the
  ``.loc`` label only reaches the intended rows when the index is exactly
  0..n-1; with any other index it appends new rows instead.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with a string column named ``'text'``. It is modified in place.
      Missing values in an otherwise string column are left untouched.

  Returns
  -------
  None
  """
  df['text'] = (
      df['text']
      .str.lower()
      .str.replace(r'[^\w\s]', '', regex=True)  # drop punctuation / symbols
      .str.replace(r'\d+', '', regex=True)      # drop digits
      .str.replace(r'\s\s+', ' ', regex=True)   # collapse whitespace runs
      .str.strip()                              # trim both ends
  )

In [23]:
# Clean the training texts; clean_text writes into the frame and returns nothing.
clean_text(train_data)

In [24]:
# Apply the same cleaning to the test texts so both sets share one text format.
clean_text(test_data)

#### Train/Test Splitting

In [25]:
# Seperate features and tagret variables
X = train_data['text']
y = train_data['lang_id']

In [26]:
# Hold out 20% of the labelled rows for evaluation. random_state pins the
# shuffle so the split, and every score computed from it, is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

#### Model Training

In [27]:
# Candidate classifiers to compare. random_state is fixed on the estimators
# that accept one so that repeated runs produce the same scores.
clfs = [
    RandomForestClassifier(max_depth=5, n_estimators=100, random_state=42),
    KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2),
    MultinomialNB(),
    LinearSVC(class_weight='balanced', random_state=42),
]

In [28]:
def trainer(clfs, x_train, x_test, y_train, y_test):
    """Fit each classifier behind a TF-IDF vectoriser and tabulate its F1 scores.

    For every estimator in ``clfs`` a two-step pipeline is built
    (``TfidfVectorizer`` with ``ngram_range=(2, 3)``, then the estimator),
    fitted on the training split and used to predict the test split.

    Parameters
    ----------
    clfs : iterable
        Estimator instances to evaluate.
    x_train, x_test
        Training and test texts passed to the pipeline.
    y_train, y_test
        Training and test labels.

    Returns
    -------
    pandas.DataFrame
        One row per classifier, indexed by the estimator's class name, with
        the columns ``F1-Macro``, ``F1-Accuracy`` (the micro-averaged F1) and
        ``F1-Weighted``. Estimators of the same class share a key, so a later
        one overwrites an earlier one.
    """
    # Column label -> value passed as f1_score's ``average`` argument.
    averaging = {
        'F1-Macro': 'macro',
        'F1-Accuracy': 'micro',
        'F1-Weighted': 'weighted',
    }

    scores_by_model = {}
    for estimator in clfs:
        model = Pipeline(steps=[
            ('tfidf', TfidfVectorizer(ngram_range=(2, 3))),
            ('clf', estimator),
        ])
        model.fit(x_train, y_train)
        y_hat = model.predict(x_test)

        # One score per averaging mode, keyed by the estimator's class name.
        scores_by_model[type(estimator).__name__] = {
            label: metrics.f1_score(y_test, y_hat, average=mode)
            for label, mode in averaging.items()
        }

    return pd.DataFrame.from_dict(scores_by_model, orient='index')

## Model performance

In [29]:
# Score every candidate model, then rank them from highest to lowest macro-F1.
clfs_df = trainer(clfs, x_train, x_test, y_train, y_test)
cfl_df = clfs_df.sort_values(by='F1-Macro', ascending=False)
cfl_df

Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted
LinearSVC,0.960247,0.959545,0.960155
MultinomialNB,0.956147,0.956667,0.956705
KNeighborsClassifier,0.923102,0.921212,0.92291
RandomForestClassifier,0.56648,0.572273,0.56498


#### Predict on the test set and write the submission file

In [30]:
# Final model: TF-IDF features followed by multinomial Naive Bayes.
# It is fitted on the complete cleaned training set (X, y) rather than on
# x_train: the 20% hold-out is only needed for the model comparison, and using
# every labelled row also makes the submission independent of the random
# train/test split.
# NOTE(review): this vectoriser uses default settings, whereas trainer() scores
# pipelines built with ngram_range=(2, 3) — the comparison table therefore does
# not describe this exact pipeline.
mnb = Pipeline([('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])
classifier = mnb.fit(X, y)
y_pred = classifier.predict(test_data['text'])

# One predicted lang_id per test row, keyed by the test set's own 'index' column.
submission = pd.DataFrame(data={'index': test_data['index'], 'lang_id': y_pred})
# NOTE(review): the output name has no '.csv' extension — confirm this is intended.
submission.to_csv('Invincible_guy_Classification_Hackathon', index=False)

In [31]:
# Preview the first rows of the submission frame as a sanity check.
submission.head(20)

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,afr
5,6,nso
6,7,eng
7,8,sot
8,9,zul
9,10,eng
