# Imports

In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

import pandas as pd
import numpy as np

from utils import get_data, get_label_encoder

# Exploration

In [4]:
# Load the dataset through the project helper (called with its defaults)
# and preview the first rows.
df = get_data()
df.head()

Unnamed: 0,Lang,Text
0,GER,IThe importance and popularity of travelling ...
1,TUR,"It is an important decision , how to plan you..."
2,CHI,Some people believe that young people can enj...
3,TEL,Travelling is usually considered as good recr...
4,ARA,i agree that . Life is a person live period o...


In [5]:
# Sorted unique language codes, with the extra "OTH" label appended so the
# binary one-vs-other models have a class for "any other language".
np.unique(np.append(df["Lang"], "OTH"))

array(['ARA', 'CHI', 'FRE', 'GER', 'HIN', 'ITA', 'JPN', 'KOR', 'OTH',
       'SPA', 'TEL', 'TUR'], dtype=object)

In [6]:
# Build the label encoder and the accompanying `labels` object over every
# language code plus "OTH" (get_label_encoder comes from the project's utils).
label_encoder, labels = get_label_encoder(np.unique(np.append(df["Lang"], "OTH")))

In [7]:
# Display the language code -> integer id mapping returned above.
labels

{'ARA': 0,
 'CHI': 1,
 'FRE': 2,
 'GER': 3,
 'HIN': 4,
 'ITA': 5,
 'JPN': 6,
 'KOR': 7,
 'OTH': 8,
 'SPA': 9,
 'TEL': 10,
 'TUR': 11}

# Notre modèle

In [8]:
def get_df_split(df, language, random_state=42):
    """Build a balanced, shuffled binary dataset: ``language`` vs. "OTH".

    Every row whose ``Lang`` equals ``language`` is kept as a positive
    example. The same number of rows is drawn from the remaining languages
    and relabelled ``"OTH"`` to act as negatives.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``"Lang"`` and ``"Text"``. It is not
        modified.
    language : str
        Language code treated as the positive class.
    random_state : int or None, default 42
        Seed used both for drawing the negatives and for the final shuffle,
        so that repeated calls return the same split. Pass ``None`` for a
        non-deterministic draw.

    Returns
    -------
    (pandas.Series, pandas.Series)
        The texts and their labels (``language`` or ``"OTH"``), row-aligned.
    """
    lang = df[df["Lang"] == language]
    other = df[df["Lang"] != language]

    # Never request more negatives than exist: DataFrame.sample raises when
    # asked for more rows than available (without replacement).
    n_other = min(len(lang), len(other))

    # `assign` returns a new frame, so the relabelling never writes into a
    # slice of the caller's DataFrame.
    other = other.sample(n_other, random_state=random_state).assign(Lang="OTH")

    df_concat = pd.concat([lang, other]).sample(frac=1, random_state=random_state)
    return df_concat["Text"], df_concat["Lang"]

In [9]:
def get_accuracy(y_true, y_pred):
    """Score set-valued predictions against single true labels.

    Parameters
    ----------
    y_true : sequence
        One true label per sample.
    y_pred : sequence of sequences
        For each sample, the candidate labels that were predicted (possibly
        empty, possibly several).

    Returns
    -------
    dict
        ``accuracy``          fraction of samples whose true label is among
                              the candidates;
        ``mean_size_tab``     mean number of candidates per sample;
        ``ratio_size_one``    fraction of samples with exactly one candidate;
        ``nb_different_tab``  number of distinct candidate lists divided by
                              the number of samples;
        ``size_zero``         count of samples with no candidate;
        ``accuracy_size_one`` accuracy restricted to single-candidate samples.

        A ratio whose denominator is zero (no samples, or no single-candidate
        sample) is reported as ``nan`` instead of raising ZeroDivisionError.
    """
    def _ratio(numerator, denominator):
        # Undefined ratios become nan rather than crashing the evaluation.
        return numerator / denominator if denominator else float("nan")

    n_samples = len(y_true)
    hits = 0
    total_size = 0
    one_size = 0
    size_zero = 0
    true_size_one = 0
    # Tuples keep element order, so two candidate lists count as the same
    # only when they are equal as lists; the set gives O(1) membership.
    distinct_candidates = set()

    for truth, candidates in zip(y_true, y_pred):
        distinct_candidates.add(tuple(candidates))
        n_candidates = len(candidates)
        total_size += n_candidates

        if n_candidates == 0:
            size_zero += 1
        elif n_candidates == 1:
            one_size += 1
            if truth == candidates[0]:
                true_size_one += 1

        if truth in candidates:
            hits += 1

    return {
        "accuracy": _ratio(hits, n_samples),
        "mean_size_tab": _ratio(total_size, n_samples),
        "ratio_size_one": _ratio(one_size, n_samples),
        "nb_different_tab": _ratio(len(distinct_candidates), n_samples),
        "size_zero": size_zero,
        "accuracy_size_one": _ratio(true_size_one, one_size),
    }

In [10]:
def train_models(train_idx, test_idx):
    """Train one binary "language vs. OTH" model per language and combine them.

    For every language a CountVectorizer + SGDClassifier pipeline is fitted on
    a balanced split produced by ``get_df_split``. Each fitted model is then
    run on the whole test fold; for every test sample, the labels of all
    models that did *not* answer "OTH" are collected as its candidate set.

    Parameters
    ----------
    train_idx, test_idx : array-like of int
        Positional row indices of the train / test fold, as yielded by
        ``StratifiedKFold.split``.

    Returns
    -------
    (dict, dict, dict)
        ``models``  : per language, ``{"model", "y_pred", "y_true"}`` where the
                      predictions are those on the balanced dev split;
        ``results`` : per language, ``{"y_pred", "y_true"}`` on the full test
                      fold;
        the metrics dictionary returned by ``get_accuracy``.
    """
    df = get_data()
    array = np.append(df["Lang"], "OTH")
    label_encoder, _ = get_label_encoder(array)

    # The fold indices are positions, not index labels, so select with iloc;
    # this stays correct even if the frame does not carry a default RangeIndex.
    train, test = df.iloc[train_idx], df.iloc[test_idx]

    # Look the "OTH" id up instead of hard-coding it: the integer depends on
    # how many language codes sort before "OTH".
    # NOTE(review): assumes label_encoder.transform accepts a plain list, as
    # sklearn's LabelEncoder does - confirm against utils.get_label_encoder.
    other_label = label_encoder.transform(["OTH"])[0]

    models = {}

    # Train the models
    for lang in df["Lang"].unique():
        clf = make_pipeline(
            CountVectorizer(ngram_range=(1, 2)),
            # Seeded so that a fold gives the same model on every run.
            SGDClassifier(random_state=42),
        )
        X_train, y_train = get_df_split(train, lang)
        X_dev, y_dev = get_df_split(test, lang)

        y_train_labels = label_encoder.transform(y_train)
        y_dev_labels = label_encoder.transform(y_dev)

        clf.fit(X_train, y_train_labels)
        dev_pred = clf.predict(X_dev)

        models[lang] = {"model": clf, "y_pred": dev_pred, "y_true": y_dev_labels}

    # Use the models on the complete test fold
    y_true = label_encoder.transform(test["Lang"])
    results = {
        lang: {"y_pred": entry["model"].predict(test["Text"]), "y_true": y_true}
        for lang, entry in models.items()
    }

    # Candidate set per test sample: every non-"OTH" answer, in model order.
    real_pred = [[] for _ in range(len(test))]
    for lang_result in results.values():
        for row, label in enumerate(lang_result["y_pred"]):
            if label != other_label:
                real_pred[row].append(label)

    return models, results, get_accuracy(y_true, real_pred)

In [11]:
# 5-fold stratified cross-validation of the per-language one-vs-other models.
X, y = df["Text"], df["Lang"]
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in skf.split(X, y):
    # Only the metrics dictionary is needed here; the fitted models and the
    # raw per-language predictions are discarded.
    _, _, acc = train_models(train_index, test_index)
    print(acc)

{'accuracy': 0.8449494949494949, 'mean_size_tab': 2.426262626262626, 'ratio_size_one': 0.1621212121212121, 'nb_different_tab': 0.1691919191919192, 'size_zero': 28, 'accuracy_size_one': 0.6978193146417445}
{'accuracy': 0.8368686868686869, 'mean_size_tab': 2.5005050505050503, 'ratio_size_one': 0.15404040404040403, 'nb_different_tab': 0.1919191919191919, 'size_zero': 22, 'accuracy_size_one': 0.7049180327868853}
{'accuracy': 0.8459595959595959, 'mean_size_tab': 2.5505050505050506, 'ratio_size_one': 0.14242424242424243, 'nb_different_tab': 0.1898989898989899, 'size_zero': 21, 'accuracy_size_one': 0.7340425531914894}
{'accuracy': 0.842929292929293, 'mean_size_tab': 2.438888888888889, 'ratio_size_one': 0.16161616161616163, 'nb_different_tab': 0.17777777777777778, 'size_zero': 30, 'accuracy_size_one': 0.728125}
{'accuracy': 0.8303030303030303, 'mean_size_tab': 2.464141414141414, 'ratio_size_one': 0.14898989898989898, 'nb_different_tab': 0.17575757575757575, 'size_zero': 27, 'accuracy_size_one'

# Implémentation Sklearn

In [12]:
def train_sklearn(train_idx, test_idx):
    """Fit a scikit-learn one-vs-rest baseline on one fold and report accuracy.

    Uses the notebook-level ``df`` as data source. Texts are vectorised with
    unigram + bigram counts, and a ``OneVsRestClassifier`` wrapping a
    logistic-loss ``SGDClassifier`` is trained on the train fold.

    Parameters
    ----------
    train_idx, test_idx : array-like of int
        Positional row indices of the train / test fold, as yielded by
        ``StratifiedKFold.split``.

    Returns
    -------
    float
        Accuracy on the test fold. The value is also printed, as before.
    """
    # The fold indices are positions, not index labels, so select with iloc.
    train, test = df.iloc[train_idx], df.iloc[test_idx]

    label_encoder, _ = get_label_encoder(np.unique(df["Lang"]))
    vectorizer = CountVectorizer(ngram_range=(1, 2))

    # Fit the vocabulary on the train fold only to avoid leaking test text.
    X_train = vectorizer.fit_transform(train["Text"])
    X_test = vectorizer.transform(test["Text"])

    y_train = label_encoder.transform(train["Lang"])
    y_test = label_encoder.transform(test["Lang"])

    base_lr = SGDClassifier(
        loss="log_loss",
        learning_rate="adaptive",
        eta0=0.1,
        max_iter=1000,
        tol=1e-3,
        random_state=42,  # seeded so the fold score is reproducible
    )
    ovr = OneVsRestClassifier(base_lr)
    ovr.fit(X_train, y_train)

    accuracy = accuracy_score(y_test, ovr.predict(X_test))
    print(accuracy)
    return accuracy

In [13]:
# 5-fold stratified cross-validation of the scikit-learn one-vs-rest baseline,
# on the same folds (same seed) as the custom model above.
X, y = df["Text"], df["Lang"]
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in skf.split(X, y):
    # train_sklearn prints the fold accuracy itself.
    train_sklearn(train_index, test_index)

0.7222222222222222
0.6994949494949495
0.7121212121212122
0.7095959595959596
0.7232323232323232


# Résultats :

#### CountVectorizer()

- 0.6904040404040404 -> LogisticRegression(solver="liblinear")

#### CountVectorizer(ngram_range=(1, 2))

- 0.7388888888888889 -> LogisticRegression(solver="liblinear")
- 0.7151515151515152 -> SGDClassifier()
- 0.7136363636363636 -> SGDClassifier(loss="log_loss")
- 0.7136363636363636 -> SGDClassifier(loss="log_loss", learning_rate="adaptive", eta0=0.1, max_iter=1000, tol=1e-3)

#### TfidfVectorizer()

- 0.6853535353535354 -> SGDClassifier(loss="log_loss", learning_rate="adaptive", eta0=0.1, max_iter=1000, tol=1e-3)
- 0.6737373737373737 -> LogisticRegression(solver="liblinear")

# Test JOUR-J

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
import numpy as np
import pandas as pd

from utils import get_data, get_label_encoder

# Final run: train on the complete labelled dataset and write one predicted
# language code per line of the unlabelled test file.
file_name = "data/test.txt"
test = get_data(file_name, columns=["Text"])
X_test = test["Text"]

df = get_data()
X_train, y_train = df["Text"], df["Lang"]

vectorizer = CountVectorizer(ngram_range=(1, 2))
label_encoder, _ = get_label_encoder(np.unique(df["Lang"]))

# The vocabulary is learnt from the training texts only; the test texts are
# projected onto it.
X_train = vectorizer.fit_transform(X_train)
y_train = label_encoder.transform(y_train)
X_vect_test = vectorizer.transform(X_test)

# Seeded so that re-running the cell regenerates the same prediction file.
base_lr = SGDClassifier(random_state=42)
ovr = OneVsRestClassifier(base_lr)
ovr.fit(X_train, y_train)
Y_pred_ovr = ovr.predict(X_vect_test)

# Map the integer ids back to language codes before writing them out.
df_res = pd.DataFrame({"Lang": label_encoder.inverse_transform(Y_pred_ovr)})
df_res.to_csv("prediction.txt", index=False, header=False)