<a href="https://colab.research.google.com/github/RachanaGusain/PahariLI/blob/main/PLI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Language Identification

## Import Libraries

In [None]:
!pip uninstall scikit-learn -y
!pip install -U scikit-learn

In [None]:
import os
import regex as re
import numpy as np
import pandas as pd

from time import time
from scipy import sparse
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.manifold import TSNE, MDS
from collections import namedtuple, defaultdict, Counter, OrderedDict
from itertools import tee, islice, accumulate, combinations
from tabulate import tabulate

In [None]:
import matplotlib
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
# Select the pgf backend before any figure is created.
matplotlib.use("pgf")

# Global plot style: LaTeX text rendering through pdflatex, serif fonts, small font
# sizes, thin lines/axes, a black-only colour cycle, and tightly-cropped PDF output.
plt.rcParams.update({
    "pgf.texsystem": 'pdflatex',
    "font.family": 'serif',  # use serif/main font for text elements
    "text.usetex": True,     # use inline math for ticks
    "pgf.rcfonts": False,    # don't setup fonts from rc parameters
    "font.size": 8,          # Use 8pt font in plots, to match 10pt font in document
    "axes.titlesize": 8,
    "axes.labelsize": 8,
    "xtick.labelsize": 6,    # Make the legend/label fonts a little smaller
    "ytick.labelsize": 6,
    "xtick.major.size": 0,
    "ytick.major.size": 0,
    "xtick.major.width": 0.2,
    "ytick.major.width": 0.2,
    "xtick.minor.size" : 1.5,
    "xtick.minor.width": 0.2,
    "xtick.direction": 'in',
    "lines.markersize": 1.2,
    "lines.linewidth": 0.5,
    "hatch.linewidth": 0.4,
    "patch.linewidth": 0.2,
    "axes.prop_cycle": matplotlib.cycler('color', 'k'),
    "hatch.color": 'k',
    "axes.linewidth": 0.2,
    "grid.linewidth": 0.2,
    "legend.fontsize": 6,
    "legend.title_fontsize": 6,
    "legend.labelspacing": 0.1,
    "legend.handlelength": 3,
    "legend.frameon": False,
    "savefig.dpi": 1000,
    "savefig.bbox": 'tight',
    "savefig.format": 'pdf'
    })

# NOTE(review): this magic runs after matplotlib.use("pgf") above; presumably it
# switches figure display to the inline backend — confirm which backend is active
# when figures are rendered/saved.
%matplotlib inline

In [None]:
! sudo apt-get install texlive-latex-recommended 
! sudo apt install texlive-latex-extra
! sudo apt install dvipng
! sudo apt install cm-super
#!apt install texlive-fonts-recommended texlive-fonts-extra cm-super dvipng

In [None]:
# Mount Google Drive at /content/drive; the data and results paths used by this
# notebook are located under /content/drive/MyDrive/PLI.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
# Short names of the classifiers evaluated in this notebook
# (mnb = multinomial naive Bayes, svm = linear support vector classifier).
clflist = ["mnb", "svm"]
# Derived from the list so the count can never drift out of sync with it.
num_clf = len(clflist)

## Load Data

In [None]:
# Data files
tr_file = "/content/drive/MyDrive/PLI/data/train.txt"
ts_file = "/content/drive/MyDrive/PLI/data/test.txt"

# Load data
# Each file is read completely inside a `with` block so its handle is closed
# immediately. `tr_data` / `ts_data` are lists of raw lines: code that iterates
# over them line by line keeps working and, unlike an exhausted file handle,
# still yields the data when a later cell is re-run.
with open(tr_file, mode='r', encoding='utf-8') as tr_handle:
    tr_data = tr_handle.readlines()
with open(ts_file, mode='r', encoding='utf-8') as ts_handle:
    ts_data = ts_handle.readlines()

print("Data loaded.")

In [None]:
# Separate text and labels
def _split_text_and_labels(lines):
    """
    Split an iterable of "<text>\t<lang>" lines into parallel lists.

    Surrounding whitespace is stripped from every line. Blank lines are skipped,
    and the label is taken to be the field after the LAST tab, so a tab inside
    the sentence no longer aborts loading.

    Parameters:
        lines: iterable of str
            Raw lines, e.g. an open text file or a list of lines.

    Returns:
        (texts, langs): tuple of two lists of str with equal length.

    Raises:
        ValueError: if a non-blank line contains no tab separator; the message
            includes the 1-based line number.
    """
    texts, langs = [], []
    for lineno, line in enumerate(lines, start=1):
        record = line.strip()
        if not record:
            continue
        text, sep, lang = record.rpartition('\t')
        if not sep:
            raise ValueError("Line %d has no tab-separated label: %r" % (lineno, record))
        texts.append(text)
        langs.append(lang)
    return texts, langs


tr_text, tr_lang = _split_text_and_labels(tr_data)
ts_text, ts_lang = _split_text_and_labels(ts_data)

In [None]:
# Integer class id for each language code, numbered in the order listed.
label = {code: idx for idx, code in enumerate(['dgo', 'gbm', 'kfy', 'npi'])}

# Encode the language code of every train / test sentence as its class id.
y_tr = np.asarray([label[lang] for lang in tr_lang])
y_ts = np.asarray([label[lang] for lang in ts_lang])

In [None]:
def size_mb(docs):
    """Return the total UTF-8 encoded size of the strings in `docs`, in units of 1e6 bytes."""
    total_bytes = 0
    for doc in docs:
        total_bytes += len(doc.encode('utf-8'))
    return total_bytes / 1e6

# Encoded size of each split, then a one-line summary per split.
tr_size_mb, ts_size_mb = (size_mb(docs) for docs in (tr_text, ts_text))

tr_count = len(tr_text)
ts_count = len(ts_text)
print("Train data: %d sentences - %0.2f MB" % (tr_count, tr_size_mb))
print("Test data : %d sentences - %0.2f MB" % (ts_count, ts_size_mb))

## Language Identification

In [None]:
def ngrams(term, ngram_range, min_df=1, max_df=1.0):
    """
    Extract word or char n-gram count features from the module-level train/test texts.

    The vocabulary is learned from `tr_text` only; both `tr_text` and `ts_text`
    are then mapped onto that vocabulary.

    Parameters:
        term: string {'word', 'char', 'char_wb'}
            Whether the feature should be made of word n-gram or character n-grams.
            Option 'char_wb' creates character n-grams only from text inside word boundaries;
            n-grams at the edges of words are padded with space.
            For 'word', a custom analyzer is used: tokens are the matches of r"\w+",
            n-grams never span a newline, and the tokens of an n-gram are joined by one space.

        ngram_range: tuple (min_n, max_n)
            The lower and upper boundary of the range of n-values for different n-grams to be extracted.
            All values of n such that min_n <= n <= max_n will be used.
            For example an ngram_range of (1, 1) means only unigrams, (1, 2) means unigrams and bigrams, and (2, 2) means only bigrams.

        min_df: float in range [0.0, 1.0] or int, default=1
            When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
            This value is also called cut-off in the literature.
            If float, the parameter represents a proportion of documents, integer absolute counts.

        max_df: float in range [0.0, 1.0] or int, default=1.0
            When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words).
            If float, the parameter represents a proportion of documents, integer absolute counts.

    Returns:
        z: dict with keys
            "features": list of feature names,
            "tr_<term>_<min_n><max_n>": sparse count matrix for the training texts,
            "ts_<term>_<min_n><max_n>": sparse count matrix for the test texts;
        or None when building the vectorizer/vocabulary raises ValueError
        (a message is printed in that case).
    """

    i, j = ngram_range

    def word_ngram_analyzer(doc):
        # Yield every run of n consecutive tokens, for n = i..j, line by line.
        for line in doc.split('\n'):
            terms = re.findall(r"\w+", line)
            for n in range(i, j+1):
                # Empty range when the line has fewer than n tokens.
                for start in range(len(terms) - n + 1):
                    yield " ".join(terms[start:start + n])

    try:
        if term == 'word':
            vectorizer = CountVectorizer(analyzer=word_ngram_analyzer, min_df=min_df, max_df=max_df)
        else:
            vectorizer = CountVectorizer(analyzer=term, ngram_range=(i, j), min_df=min_df, max_df=max_df)
        # fit_transform learns the vocabulary and vectorises the training texts
        # in one pass instead of tokenising the training corpus twice.
        tr_matrix = vectorizer.fit_transform(tr_text)
    except ValueError:
        # NOTE(review): any ValueError from the vectorizer is reported with this
        # message, not only the "no terms remain" case.
        print("Error: After pruning, no terms remain.")
        return None

    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
    # and fall back only on releases that predate it.
    if hasattr(vectorizer, "get_feature_names_out"):
        features = vectorizer.get_feature_names_out().tolist()
    else:
        features = vectorizer.get_feature_names()

    suffix = term + "_" + str(i) + str(j)
    z = dict()
    z["features"] = features
    z["tr_" + suffix] = tr_matrix
    z["ts_" + suffix] = vectorizer.transform(ts_text)

    return z

In [None]:
def classifier(clf, params, X_tr, y_tr, X_ts, y_ts):
    """
    Tune `clf` with a 5-fold accuracy grid search, then evaluate the best model on the test set.

    Prints the best estimator, timings and a classification report, and shows a
    confusion-matrix plot. Class names are taken from the module-level `label` dict.

    Parameters:
        clf: scikit-learn estimator to tune.
        params: dict, parameter grid passed to GridSearchCV.
        X_tr, y_tr: training features and labels.
        X_ts, y_ts: test features and labels.

    Returns:
        tuple (best_params, tr_val_time, ts_time, scores_report, cm_disp, y_pred)
            best_params: dict of the winning grid-search parameters.
            tr_val_time: seconds spent in the grid search fit.
            ts_time: seconds spent predicting the test set.
            scores_report: pandas.DataFrame built from the classification report dict.
            cm_disp: confusion-matrix display object (exposes `.confusion_matrix`).
            y_pred: predictions of the best estimator on X_ts.
    """
    print("Training", clf)

    search = GridSearchCV(estimator=clf, param_grid=params, scoring='accuracy',
                          cv=5, verbose=1, return_train_score=True)
    t0 = time()
    search.fit(X_tr, y_tr)
    tr_val_time = time() - t0
    clf = search.best_estimator_
    print("\nBest Estimator:", clf)
    print("\nTrain and validation time: %.4f seconds" % tr_val_time)

    t0 = time()
    y_true, y_pred = y_ts, clf.predict(X_ts)
    ts_time = time() - t0
    print("\nTest time: %.4f seconds" % ts_time)

    # A concrete list rather than a dict_keys view, which is what the metrics API expects.
    class_names = list(label)

    scores_report = metrics.classification_report(y_true, y_pred, target_names=class_names, output_dict=True)
    scores_report = pd.DataFrame(scores_report)

    print(metrics.classification_report(y_true, y_pred, target_names=class_names, digits=4))

    # metrics.plot_confusion_matrix was removed in scikit-learn 1.2. Its replacement
    # builds the display from the predictions already computed above, which also
    # avoids a second predict() call. Fall back to the old helper on old releases.
    if hasattr(metrics.ConfusionMatrixDisplay, "from_predictions"):
        cm_disp = metrics.ConfusionMatrixDisplay.from_predictions(
            y_true, y_pred, values_format='d',
            display_labels=class_names,
            cmap=plt.cm.Blues, colorbar=False)
    else:
        cm_disp = metrics.plot_confusion_matrix(clf, X_ts, y_ts, values_format='d',
                                                display_labels=class_names,
                                                cmap=plt.cm.Blues, colorbar=False)
    plt.show()

    return search.best_params_, tr_val_time, ts_time, scores_report, cm_disp, y_pred

In [None]:
def build_ngram_model(max_n, out_dir=None, min_df=0.005):
    """
    Train and evaluate every classifier on every n-gram feature set up to `max_n`.

    For each analyzer in ('word', 'char', 'char_wb') and each range (i, j) with
    1 <= i <= j <= max_n, count features are extracted with `ngrams`, each
    classifier is tuned and evaluated with `classifier`, and its confusion matrix
    is written to `<out_dir>/confusion_matrix/cm_<clfname>_<analyzer>_<i><j>.csv`.
    Feature sets for which `ngrams` returns None are skipped.

    Parameters:
        max_n: int
            Largest n-gram order to evaluate.
        out_dir: str or None, default=None
            Directory for the result files. None means the module-level `dirpath`.
        min_df: float or int, default=0.005
            Document-frequency cut-off forwarded to `ngrams`.

    Returns:
        (results, outputs)
            results: dict mapping each name in `clflist` to a pandas.DataFrame with
                one row per feature set (columns listed in `columns` below).
            outputs: dict mapping each name in `clflist` to a dict
                {feature-set name: test-set predictions}.
    """
    if out_dir is None:
        out_dir = dirpath
    # Make sure the target directory exists so writing the CSVs cannot fail on a fresh Drive.
    cm_dir = os.path.join(out_dir, 'confusion_matrix')
    os.makedirs(cm_dir, exist_ok=True)

    ngram = range(1, max_n+1)
    ngram_range = [(i, j) for i in ngram for j in ngram if i<=j]

    outputs = {clf: dict() for clf in clflist}
    results = {clf: [] for clf in clflist}
    columns = ["Vectorizer", "#Features", "Hyperparameter",
               "Train&Val time (s)", "Test time (s)",
               "Precision", "Recall", "F1-score", "Accuracy"]

    # Shared hyperparameter grid: 1e-3, 1e-2, 1e-1, 1e0, 1e1.
    grid_values = np.power(10, np.arange(-3, 2, dtype=float))

    for analyzer in ['word', 'char', 'char_wb']:
        for (i, j) in ngram_range:
            print('*'*80)
            print(f"Extracting frequency based {analyzer} n-gram features...")
            z = ngrams(term=analyzer, ngram_range=(i, j), min_df=min_df)
            if z is None:
                continue

            vect = analyzer+"_"+str(i)+str(j)
            # NOTE(review): the sparse matrices are densified here; both estimators
            # presumably accept sparse input, which would save memory — confirm
            # results are unchanged before removing .toarray().
            X_tr = z["tr_"+vect].toarray()
            X_ts = z["ts_"+vect].toarray()

            # Fresh estimators per feature set, paired positionally with the names in clflist.
            candidates = [
                (MultinomialNB(), {'alpha': grid_values}),      # Multinomial Naive Bayes classifier
                (LinearSVC(dual=False), {'C': grid_values}),    # Linear support vector classifier
            ]
            for (clf, params), clfname in zip(candidates, clflist):
                best_param, tr_val_time, ts_time, scores, cm_disp, y_pred = classifier(clf, params, X_tr, y_tr, X_ts, y_ts)
                cm_df = pd.DataFrame(cm_disp.confusion_matrix, index=label.keys(), columns=label.keys())
                cm_df.to_csv(os.path.join(cm_dir, 'cm_'+clfname+'_'+vect+'.csv'))
                result = []
                result.extend([vect, len(z["features"])])
                result.extend(best_param.values())
                result.extend([tr_val_time, ts_time])
                # "macro avg" holds precision, recall, f1-score, support; drop the trailing support.
                result.extend(scores["macro avg"].tolist()[:-1])
                result.append(scores["accuracy"].iat[0]*100)
                results[clfname].append(dict(zip(columns, result)))
                outputs[clfname][vect] = y_pred

    return {clf: pd.DataFrame(results[clf]) for clf in clflist}, outputs

In [None]:
# Results directory on the mounted Drive; build_ngram_model reads this module-level
# name when writing its confusion-matrix CSV files.
dirpath = "/content/drive/MyDrive/PLI/results"
# Evaluate all n-gram ranges (i, j) with 1 <= i <= j <= 8 for every analyzer and classifier.
results, outputs = build_ngram_model(8)