# Imports

In [1]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from utils import (get_data, display_results,
                   get_label_encoder, get_train_dev_test)

In [2]:
# Load the corpus through the project helper and preview the first rows.
# The preview shows a `Lang` column (language code) and a `Text` column.
df = get_data()
df.head()

Unnamed: 0,Lang,Text
0,GER,IThe importance and popularity of travelling ...
1,TUR,"It is an important decision , how to plan you..."
2,CHI,Some people believe that young people can enj...
3,TEL,Travelling is usually considered as good recr...
4,ARA,i agree that . Life is a person live period o...


In [3]:
# Label vocabulary: every language code present in the data plus the extra "OTH" label.
np.unique(np.append(df["Lang"], "OTH"))

array(['ARA', 'CHI', 'FRE', 'GER', 'HIN', 'ITA', 'JPN', 'KOR', 'OTH',
       'SPA', 'TEL', 'TUR'], dtype=object)

In [4]:
# Build the encoder and the label mapping from the sorted, de-duplicated
# vocabulary (language codes + "OTH"). `get_label_encoder` is a project helper.
label_encoder, labels = get_label_encoder(np.unique(np.append(df["Lang"], "OTH")))

In [5]:
# Display the label -> integer code mapping returned by `get_label_encoder`.
labels

{'ARA': 0,
 'CHI': 1,
 'FRE': 2,
 'GER': 3,
 'HIN': 4,
 'ITA': 5,
 'JPN': 6,
 'KOR': 7,
 'OTH': 8,
 'SPA': 9,
 'TEL': 10,
 'TUR': 11}

In [6]:
from sklearn.model_selection import StratifiedKFold

In [7]:
# Stratified 5-fold split over the language labels; the fixed seed makes the
# folds reproducible across runs.
X = df["Text"]
y = df["Lang"]
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
splits = skf.split(X, y)

# Print every fold. `train_idx` / `test_idx` are overwritten on each pass, so
# once the loop finishes they hold the indices of the LAST fold only.
for i, fold in enumerate(splits):
    train_index, test_index = fold
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")
    train_idx, test_idx = train_index, test_index

Fold 0:
  Train: index=[   1    3    4 ... 9895 9897 9898]
  Test:  index=[   0    2   20 ... 9892 9896 9899]
Fold 1:
  Train: index=[   0    1    2 ... 9896 9898 9899]
  Test:  index=[   4    7    9 ... 9887 9890 9897]
Fold 2:
  Train: index=[   0    2    3 ... 9896 9897 9899]
  Test:  index=[   1   11   25 ... 9875 9884 9898]
Fold 3:
  Train: index=[   0    1    2 ... 9897 9898 9899]
  Test:  index=[   8   13   14 ... 9882 9888 9893]
Fold 4:
  Train: index=[   0    1    2 ... 9897 9898 9899]
  Test:  index=[   3    5    6 ... 9883 9894 9895]


In [8]:
def get_df_split(df, language, random_state=42, other_label="OTH"):
    """Build a balanced binary dataset: `language` versus everything else.

    All rows whose `Lang` equals `language` are kept as positives. An equally
    sized random sample of the remaining rows is relabelled `other_label` and
    used as negatives. The combined frame is shuffled before being returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `Text` and `Lang`. It is not modified.
    language : str
        Value of `Lang` that identifies the positive class.
    random_state : int or None, default 42
        Seed used for both the negative sampling and the final shuffle, so
        repeated calls return the same split. Pass None for a random split.
    other_label : str, default "OTH"
        Label written into `Lang` for the sampled negative rows.

    Returns
    -------
    (pandas.Series, pandas.Series)
        The `Text` column and the relabelled `Lang` column, row-aligned.
    """
    is_target = df["Lang"] == language
    lang = df[is_target]
    rest = df[~is_target]

    # Never request more negatives than exist: `sample` without replacement
    # raises ValueError when n exceeds the population size.
    n_other = min(len(lang), len(rest))

    # `assign` returns a new frame, so the caller's data is never written to
    # (and no chained-assignment warning can be triggered).
    other = rest.sample(n=n_other, random_state=random_state).assign(Lang=other_label)

    df_concat = pd.concat([lang, other]).sample(frac=1, random_state=random_state)
    return df_concat["Text"], df_concat["Lang"]

In [9]:
def get_accuracy(y_true, y_pred):
    """Score set-valued predictions against single true labels.

    Parameters
    ----------
    y_true : sequence
        One true label per sample.
    y_pred : sequence of sequences
        For each sample, the (possibly empty) list of candidate labels.

    Returns
    -------
    tuple
        Six values, in this order:
        0. share of samples whose true label is among the candidates,
        1. average number of candidates per sample,
        2. share of samples with exactly one candidate,
        3. number of distinct candidate lists divided by the number of samples,
        4. number of samples with no candidate at all (a count, not a ratio),
        5. share of single-candidate samples whose candidate is the true label.
        A ratio whose denominator is zero is returned as NaN instead of
        raising ZeroDivisionError.

    Raises
    ------
    ValueError
        If `y_true` and `y_pred` have different lengths.
    """
    n_samples = len(y_true)
    if len(y_pred) != n_samples:
        raise ValueError(
            f"y_true and y_pred must have the same length "
            f"({n_samples} != {len(y_pred)})"
        )

    def _ratio(numerator, denominator):
        # Undefined ratios (no samples, or no singleton predictions) become NaN.
        return numerator / denominator if denominator else float("nan")

    hits = 0
    total_size = 0
    singletons = 0
    singleton_hits = 0
    empties = 0
    # Tuples keep the order-sensitive equality of the candidate lists while
    # allowing constant-time membership checks.
    distinct_sets = set()

    for truth, candidates in zip(y_true, y_pred):
        distinct_sets.add(tuple(candidates))
        size = len(candidates)
        total_size += size
        if size == 0:
            empties += 1
        elif size == 1:
            singletons += 1
            if truth == candidates[0]:
                singleton_hits += 1
        if truth in candidates:
            hits += 1

    return (
        _ratio(hits, n_samples),
        _ratio(total_size, n_samples),
        _ratio(singletons, n_samples),
        _ratio(len(distinct_sets), n_samples),
        empties,
        _ratio(singleton_hits, singletons),
    )

In [10]:
def train_models(df, train_index=None, test_index=None, random_state=42):
    """Train one "language vs. OTH" classifier per language and combine them.

    For every language in `df["Lang"]` a TF-IDF + SGD pipeline is fitted on a
    balanced binary split of the training rows (see `get_df_split`) and scored
    on a balanced binary split of the test rows. Every model is then applied to
    the full test fold; for each test sample, the labels predicted by the
    models that did NOT answer "OTH" form its candidate list, which is scored
    with `get_accuracy`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `Text` and `Lang`.
    train_index, test_index : array-like of int, optional
        Positional row indices of the train / test fold, as produced by
        `StratifiedKFold.split`. When omitted, the notebook-level `train_idx`
        and `test_idx` are used, so `train_models(df)` behaves as before.
    random_state : int or None, default 42
        Seed passed to every `SGDClassifier` so results are reproducible.

    Returns
    -------
    (dict, dict, tuple)
        `models`: per language, the fitted pipeline plus its predictions and
        true labels on the balanced dev split.
        `results`: per language, the model's predictions on the whole test
        fold together with the encoded true labels.
        The tuple returned by `get_accuracy` for the combined candidate lists.
    """
    # Fall back to the fold selected earlier in the notebook.
    if train_index is None:
        train_index = train_idx
    if test_index is None:
        test_index = test_idx

    label_encoder, _ = get_label_encoder(np.append(df["Lang"], "OTH"))
    # Look the "OTH" code up instead of hard-coding it: its value depends on
    # which languages are present in `df`.
    # NOTE(review): assumes `label_encoder.transform` accepts any array-like,
    # as sklearn's LabelEncoder does — confirm against utils.get_label_encoder.
    other_code = label_encoder.transform(["OTH"])[0]

    # The fold indices are positions, not index labels, hence `.iloc`.
    train, test = df.iloc[train_index], df.iloc[test_index]

    models = {}

    # Train one binary model per language.
    for lang in df["Lang"].unique():
        clf = make_pipeline(
            TfidfVectorizer(),
            SGDClassifier(random_state=random_state),
        )
        X_train, y_train = get_df_split(train, lang)
        X_dev, y_dev = get_df_split(test, lang)

        y_train_labels = label_encoder.transform(y_train)
        y_dev_labels = label_encoder.transform(y_dev)

        clf.fit(X_train, y_train_labels)
        pred = clf.predict(X_dev)

        models[lang] = {"model": clf, "y_pred": pred, "y_true": y_dev_labels}
        print(f"Accuracy for {lang}: {accuracy_score(y_dev_labels, pred)}")

    # Apply every binary model to the complete test fold.
    y_true = label_encoder.transform(test["Lang"])
    results = {
        lang: {"y_pred": entry["model"].predict(test["Text"]), "y_true": y_true}
        for lang, entry in models.items()
    }

    # Per test sample, collect the labels of the models that claimed it.
    real_pred = [[] for _ in range(len(test))]
    for outcome in results.values():
        for row, code in enumerate(outcome["y_pred"]):
            if code != other_code:
                real_pred[row].append(code)

    return models, results, get_accuracy(y_true, real_pred)

In [11]:
# Train the per-language binary models on the fold held in `train_idx` / `test_idx`
# and score the combined candidate lists.
models, results, acc = train_models(df)

Accuracy for GER: 0.8861111111111111
Accuracy for TUR: 0.8611111111111112
Accuracy for CHI: 0.85
Accuracy for TEL: 0.9305555555555556
Accuracy for ARA: 0.7861111111111111
Accuracy for SPA: 0.8055555555555556
Accuracy for HIN: 0.8416666666666667
Accuracy for JPN: 0.8611111111111112
Accuracy for KOR: 0.8333333333333334
Accuracy for FRE: 0.8555555555555555
Accuracy for ITA: 0.8361111111111111


In [12]:
# Tuple from `get_accuracy`, in order: share of samples whose true label is among
# the candidates, mean number of candidates, share of single-candidate samples,
# distinct candidate lists / samples, count of samples with no candidate,
# accuracy on the single-candidate samples.
acc

(0.8631313131313131,
 2.4575757575757575,
 0.1606060606060606,
 0.17929292929292928,
 21,
 0.7578616352201258)

# Scikit-learn implementation (OneVsRestClassifier baseline)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import jaccard_score
from sklearn.multiclass import OneVsRestClassifier

# Baseline: scikit-learn's own one-vs-rest wrapper, evaluated on the same fold
# as the hand-rolled per-language models above.

# The fold indices are positions (from StratifiedKFold.split), hence `.iloc`.
train, test = df.iloc[train_idx], df.iloc[test_idx]

X_train, y_train = train["Text"], train["Lang"]
X_test, y_test = test["Text"], test["Lang"]

vectorizer = TfidfVectorizer()

# Fit the vocabulary on the training texts only, then reuse it for the test texts.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

# NOTE(review): the name says "lr" but the estimator is an SGDClassifier, the
# same base model used in `train_models`; the name is kept for compatibility.
# A fixed seed makes the reported accuracy reproducible.
base_lr = SGDClassifier(random_state=42)
ovr = OneVsRestClassifier(base_lr)
ovr.fit(X_train, y_train)
Y_pred_ovr = ovr.predict(X_test)

In [15]:
# Accuracy of the one-vs-rest baseline on the held-out fold.
accuracy_score(y_test, Y_pred_ovr)

0.7196969696969697