In [None]:
# Fetch the training CSV from Google Drive by file id (saved as train.csv in the working directory)
!gdown --id 1GH682t9d8UjKtusxRa6bl0V0lkQ0DsVV

Downloading...
From: https://drive.google.com/uc?id=1GH682t9d8UjKtusxRa6bl0V0lkQ0DsVV
To: /content/train.csv
100% 17.5M/17.5M [00:00<00:00, 73.2MB/s]


In [None]:

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf

In [None]:
# Load the downloaded training set into a DataFrame
train_df = pd.read_csv('train.csv')

In [None]:
# Preview the first rows of the loaded data
train_df.head()

Unnamed: 0,text,dialect
0,حاطينهم فوق التلاجة ولا تحت الدولاب,LY
1,واقعة سيد عبد النعيم بعين طفل عمره سنة عمرو شوقا,EG
2,باقي ايام رفع الحظر وامريكا في المشيه والجيه ت...,SD
3,خالص ما مصدق عم يغني صراحة الارتب حماقي ما بعر...,LB
4,زعما الناس تقدر تطلع وتعتصم قدام بو زي ما دارو...,LY


In [None]:
# Distinct dialect labels present in the data (the 'dialect' column is the classification target)
lang = train_df['dialect'].unique()
lang

array(['LY', 'EG', 'SD', 'LB', 'MA'], dtype=object)

In [None]:
# Inspect the raw text of the first sample
train_df.iloc[0]['text']

'حاطينهم فوق التلاجة ولا تحت الدولاب'

In [None]:
# Drop every row that has a missing value in any column. The result is
# rebound to the same name instead of using inplace=True, which pandas
# discourages and which mutates the frame behind the reader's back.
# The index is deliberately not reset here.
train_df = train_df.dropna()

In [None]:
# Sanity check after dropping NaNs: count remaining nulls in 'text' and
# list the Python types found in that column
print(train_df['text'].isnull().sum())
print(train_df['text'].apply(type).value_counts())

0
<class 'str'>    132872
Name: text, dtype: int64


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
def get_trigrams(corpus,n_feat=1200):
    """
    Return the `n_feat` most frequent character trigrams found in `corpus`.

    Params
    ------------
        corpus: iterable of strings (e.g. a list or a pandas Series of sentences)
        n_feat: integer, maximum number of trigrams to keep (default 1200)

    Returns
    ------------
        numpy array of trigram strings, exactly as produced by
        `CountVectorizer.get_feature_names_out()` (ordered by vocabulary
        index, not ranked by frequency). Fewer than `n_feat` entries are
        returned when the corpus contains fewer distinct trigrams.
    """

    # Only the learned vocabulary is needed, so fit the n-gram model
    # without keeping the document-term matrix around.
    vectorizer = CountVectorizer(analyzer='char',
                            max_features=n_feat,
                            ngram_range=(3, 3))
    vectorizer.fit(corpus)

    return vectorizer.get_feature_names_out()

In [None]:

#obtain the most frequent trigrams of each dialect
features = {}
features_set = set()
lang = train_df['dialect'].unique()
for dialect in lang:

    #get corpus filtered by dialect
    corpus = train_df[train_df.dialect==dialect]['text']
    #get the most frequent trigrams (get_trigrams keeps n_feat=1200 by default)
    trigrams = get_trigrams(corpus)

    #add to dict and set
    features[dialect] = trigrams
    features_set.update(trigrams)

#create the vocabulary (trigram -> column index) from the union of all
#per-dialect trigrams. Sorting first makes the column order deterministic:
#iterating a set of strings directly can yield a different order in every
#kernel session, which would silently reshuffle the feature columns.
vocab = {trigram: idx for idx, trigram in enumerate(sorted(features_set))}

In [None]:
#count vectoriser restricted to the fixed trigram vocabulary
vectorizer = CountVectorizer(vocabulary=vocab,
                            analyzer='char',
                            ngram_range=(3, 3))

#count every vocabulary trigram in each training sentence
corpus = train_df['text']
X = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names_out()

#dense feature table: one column per trigram, plus the label column
#(labels are copied by position, so the frame's fresh index does not matter)
train_feat = pd.DataFrame(X.toarray(), columns=feature_names)
train_feat['dialect'] = train_df['dialect'].tolist()

In [None]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

encoder = LabelEncoder()
def encode(y, fit=None):
    """
    Returns an array of one hot encodings

    The shared module-level `encoder` is fitted only once (on the first
    call, normally with the training labels) and reused afterwards, so
    every split gets the same label -> index mapping and the same number
    of one hot columns, even when a split is missing a class.

    Params
    ---------
        y: list of language labels
        fit: None (default) fits the encoder only if it has not been
             fitted yet; True forces a re-fit on `y`; False only transforms

    Raises
    ---------
        ValueError (from LabelEncoder.transform) if `y` contains a label
        the encoder was not fitted on
    """
    if fit is None:
        #a fitted LabelEncoder exposes `classes_`
        fit = not hasattr(encoder, 'classes_')

    if fit:
        y_encoded = encoder.fit_transform(y)
    else:
        y_encoded = encoder.transform(y)

    #fix the width explicitly so it never depends on which labels occur in y
    y_dummy = np_utils.to_categorical(y_encoded, num_classes=len(encoder.classes_))

    return y_dummy

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split

#Get training data
X = train_feat.drop('dialect',axis=1)
y = train_feat['dialect']
X_train, X_test, y_train, y_test = train_test_split(X,y ,
                                   random_state=42,
                                   test_size=0.1)

#One hot encode the training labels, then transform the test labels with
#the very same encoder instead of deriving a separate encoding from the
#test split. Both splits therefore share one label -> index mapping and
#one hot width, which is what training, evaluation and
#encoder.inverse_transform rely on.
y_train = encode(y_train)
y_test = np_utils.to_categorical(encoder.transform(y_test),
                                 num_classes=y_train.shape[1])


In [None]:
#Define model: one hidden ReLU layer followed by a softmax over the dialects.
#The output width follows the one hot labels instead of a hard-coded 5, so
#the network always matches the number of encoded classes.
n_classes = y_train.shape[1]
model = Sequential()
model.add(Dense(2048, activation='relu'))
model.add(Dense(n_classes, activation='softmax'))
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
#categorical cross-entropy is paired with the one hot encoded targets
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.fit(X_train,y_train, epochs = 10, batch_size = 128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f64728e9b40>

In [None]:
from sklearn.metrics import accuracy_score
# Class probabilities predicted by the network for every held-out sample
y_probs = model.predict(X_test)

# Most probable class index per row, for the predictions and for the
# one hot ground truth alike
y_pred = y_probs.argmax(axis=-1)
true_idx = y_test.argmax(axis=-1)

# Map class indices back to the original dialect labels
predictions, true_label = (encoder.inverse_transform(idx) for idx in (y_pred, true_idx))

# Share of test samples whose predicted label equals the true label
accuracy = accuracy_score(y_true=true_label, y_pred=predictions)
print(accuracy)


0.7740066225165563


In [None]:
import joblib

#Persist everything needed for inference: the trained network, the label
#encoder that maps predicted class indices back to dialect codes, and the
#trigram vectoriser. Without the encoder a reloaded model can only emit
#integer class indices.
#NOTE(review): Keras documents model.save()/load_model() as the supported
#way to store models; the joblib pickle is kept so existing consumers of
#model.pkl keep working - confirm it round-trips on the deployed version.
joblib.dump(model, 'model.pkl')
joblib.dump(encoder, 'encoder.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']