In [12]:
import pandas as pd
import numpy as np
import joblib
# Raise the column display width so long tweet texts are shown in full
# instead of being truncated when a DataFrame is rendered.
pd.set_option('display.max_colwidth', 100000)

In [13]:
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

In [32]:
# Load the tweet dataset; later cells read its 'tweet' (text) and
# 'dialect' (label) columns.
data=pd.read_csv('datasets/cleaned_data.csv')

In [44]:
# Peek at five random rows; no random_state is passed, so the rows shown
# differ from run to run.
data.sample(5)

Unnamed: 0,id,tweet,dialect
137591,459965239623422016,اهلا فيك اخوي انا الحمد الله بخير ومشتاق للجميع والله,SY
236930,771098650604961792,للاسف ايام ما ترجع الزمن الجميل,DZ
21891,1164225427713462272,فكرت في اي تي طلع اول سمستر حديد يحك في بعضه,LY
341777,1088588079215067136,النومه اليوم بدون ألارم من غير شر,KW
307445,1075864530255798400,يلي مفكر ينزل يتظاهر هلاء واكيد معو حق فيا لأن الوضع فعلا ماعاد ينحمل بس بحب قلو الله يقويك بس نازلين معك ناس انت بالاساس نازل ضد تسلطن وفسادن راجع حساباتك صديقي وبكل الاحوال الطقس رح يكون عاطل,LB


In [33]:
# Number of whitespace-separated words in every tweet, in row order.
len_texts = [len(tweet.split()) for tweet in data['tweet']]
  

In [34]:
from collections import Counter

# Tally how many tweets have each word count, then list the lengths from
# most to least frequent.
count = Counter()
count.update(len_texts)
print(count.most_common())


[(4, 43663), (5, 41703), (6, 38200), (7, 33711), (8, 30146), (9, 26460), (10, 24110), (11, 21526), (12, 19310), (13, 16807), (14, 14906), (15, 13083), (16, 11970), (17, 10752), (18, 10020), (19, 9566), (20, 9276), (21, 9033), (22, 8426), (23, 7627), (24, 6744), (25, 5639), (26, 4744), (27, 3787), (28, 3013), (29, 2514), (30, 2261), (31, 1936), (32, 1806), (33, 1682), (34, 1530), (35, 1447), (36, 1446), (37, 1366), (38, 1278), (39, 1214), (47, 1210), (40, 1178), (41, 1160), (44, 1152), (46, 1151), (45, 1136), (42, 1107), (43, 1099), (48, 1090), (49, 1046), (50, 979), (51, 819), (52, 730), (53, 583), (54, 377), (55, 274), (56, 149), (57, 107), (58, 56), (59, 49), (60, 22), (61, 6), (62, 5), (65, 2), (68, 2), (63, 2), (64, 1), (0, 1), (88, 1), (70, 1)]


In [4]:
# Remove tweets that contain no words at all.
# Comparing against exactly ' ' would miss '', tabs, or runs of several
# spaces, all of which also split into zero words; stripping whitespace first
# catches every such case. The filtered frame is reassigned rather than
# mutated in place, and the original index labels are kept.
has_words = data['tweet'].str.strip() != ''
data = data[has_words]


In [None]:
# Learn the mapping from dialect codes to integer class ids, then apply it to
# every row. The fitted encoder is kept so predictions can be mapped back to
# dialect codes later.
encoder = LabelEncoder()
encoder.fit(data['dialect'])
y_encoded = encoder.transform(data['dialect'])

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 1% of the tweets for testing. Stratifying on the labels keeps the
# dialect proportions the same in both splits; the fixed seed makes the split
# repeatable.
TEST_FRACTION = 0.01
SPLIT_SEED = 42

x_train, x_test, y_train, y_test = train_test_split(
    data['tweet'],
    y_encoded,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
    stratify=y_encoded,
)

In [7]:
# Convert the integer class ids to one-hot rows for categorical cross-entropy.
# num_classes is passed explicitly: left to itself, to_categorical sizes each
# matrix from the largest label present in that array, so a split that happens
# to lack the highest class id would get fewer columns than the other split
# and than the model's output layer.
num_classes = len(encoder.classes_)
y_train_encoded = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
y_test_encoded = tf.keras.utils.to_categorical(y_test, num_classes=num_classes)

In [8]:
# max_words caps the tokenizer vocabulary (and sizes the embedding table);
# max_len is the fixed length every sequence is padded or truncated to.
max_words, max_len = 1000, 50

tok = tf.keras.preprocessing.text.Tokenizer(num_words=max_words)
# The vocabulary is learned from the training split only.
tok.fit_on_texts(x_train)

# Turn each training tweet into a list of word ids, then into a fixed-width matrix.
sequences = tok.texts_to_sequences(x_train)
X_train_seq = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=max_len,
)

In [None]:
# Bidirectional LSTM classifier: word ids -> 64-d embeddings -> Bi-LSTM ->
# dense ReLU layer -> softmax over the dialect classes.
layers = tf.keras.layers

lstm_bi = tf.keras.models.Sequential()
lstm_bi.add(layers.Embedding(max_words, 64))
lstm_bi.add(layers.Bidirectional(layers.LSTM(64)))
lstm_bi.add(layers.Dense(64, activation='relu'))
# One output unit per class: the largest class id plus one.
lstm_bi.add(layers.Dense(max(y_encoded)+1, activation='softmax'))

In [None]:
# Categorical cross-entropy pairs with the one-hot targets and the softmax
# output; Adam is used with its default settings.
lstm_bi.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Stop training once the validation loss stops improving by at least 1e-4.
# patience is left at the Keras default.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0001)

# Train for up to 5 epochs, holding back 20% of the training data for validation.
history = lstm_bi.fit(
    X_train_seq,
    y_train_encoded,
    epochs=5,
    validation_split=0.2,
    callbacks=[early_stop],
)

Epoch 1/5
Epoch 2/5


In [None]:
# Encode and pad the held-out tweets exactly like the training tweets
# (same tokenizer, same max_len), then score the model on them.
test_sequences = tok.texts_to_sequences(x_test)
test_sequences_matrix = tf.keras.preprocessing.sequence.pad_sequences(
    test_sequences,
    maxlen=max_len,
)

# evaluate() returns the loss followed by the compiled metric (accuracy).
accr = lstm_bi.evaluate(test_sequences_matrix, y_test_encoded)
test_loss, test_accuracy = accr[0], accr[1]
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(test_loss, test_accuracy))

Test set
  Loss: 1.949
  Accuracy: 0.372


In [26]:
# Persist the fitted label encoder (maximum compression) so class ids can be
# mapped back to dialect codes at prediction time.
joblib.dump(encoder,'dialect_api/models/LabelEncoder.joblib',compress=9) #save label encoder 

['dialect_api/models/LabelEncoder.joblib']

In [27]:
# Persist the fitted tokenizer (maximum compression) so new text can be
# converted to the same word ids the model was trained on.
joblib.dump(tok,'dialect_api/models/tokenizer.joblib',compress=9) #save tokenizer

['dialect_api/models/tokenizer.joblib']

In [None]:
# Write the trained model to a single HDF5 file next to the tokenizer and
# label encoder.
lstm_bi.save('dialect_api/models/lstm_bi.h5') #save lstm model

### Try loading the saved model and testing a prediction

In [16]:
# Reload the three saved artifacts from disk to check that they work together
# independently of the in-memory training objects.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load artifacts produced by this notebook or another trusted source.
lstmBi_model = tf.keras.models.load_model('dialect_api/models/lstm_bi.h5')
tokin=joblib.load('dialect_api/models/tokenizer.joblib')
le=joblib.load('dialect_api/models/LabelEncoder.joblib')

In [45]:
# Two sample texts used to smoke-test the reloaded model.
texts=['اشتاقتلك خوى','المجتمع ديالنا بكل صراحة صعيب بزاف']

In [46]:
# Classify each sample text with the reloaded artifacts and collect the results.

# Encode with the reloaded tokenizer so word ids match those used in training.
encoded_texts = tokin.texts_to_sequences(texts)
# maxlen must equal the padding length used for the training sequences.
seq = tf.keras.preprocessing.sequence.pad_sequences(encoded_texts, maxlen=50)

# predict() returns one score per class for every text; argmax selects the
# top-scoring class directly instead of fully sorting each row.
class_ids = np.argmax(lstmBi_model.predict(seq), axis=1)
# Map all class ids back to dialect codes in one vectorised call.
labels = le.inverse_transform(class_ids)

response = [
    {'text': text, 'prediction': label}
    for text, label in zip(texts, labels)
]

response

[{'prediction': 'KW', 'text': 'اشتاقتلك خوى'},
 {'prediction': 'LB', 'text': 'المجتمع ديالنا بكل صراحة صعيب بزاف'}]