# Sina Najafi #

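Machine-learning project: binary classification of Persian question sentences using a TensorFlow/Keras text-embedding model.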

In [None]:
# Check which GPU (if any) is attached to this runtime; `-L` prints one line per device.
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-9051d509-3109-3c74-f79b-1ad40d01c9f4)


In [None]:
# Load the labelled questions from data.csv.
import pandas as pd

# The first CSV column holds row numbers (it appears to be a saved index).
# Read it as the index so it does not show up as a spurious "Unnamed: 0" column.
df = pd.read_csv('data.csv', index_col=0)

df.head()

Unnamed: 0,question,label
0,طارند بالا ، چه چیزی است؟,1
1,طارند بالا ، روستایی از توابع کجا است؟,1
2,طارند بالا ، روستایی از توابع کجا در استان ته...,1
3,طارند بالا ، روستایی از توابع بخش جلیل آباد ش...,1
4,کجا روستایی در استان تهران ایران است,1


In [None]:
# Map every negative label (-1) to 0 so the classes are {0, 1}.
# A single boolean-mask assignment replaces the row-by-row iterrows() loop:
# same result, but vectorized and without per-row Series construction.
df.loc[df['label'] < 0, 'label'] = 0

# Check how many examples of each label we have.
df['label'].value_counts()

1    1861
0     790
Name: label, dtype: int64

In [None]:
# Split the data into train and test sets (80% / 20%).
import numpy as np  # was aliased as `pd`, which overwrote the pandas alias
from sklearn.model_selection import train_test_split

# Fix the shuffle seed so the split (and everything trained on it) is the
# same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['question'].to_numpy(),
    df['label'].to_numpy(),
    test_size=0.2,
    random_state=42,
)
print(X_train[:7])
print(y_train[:7])

['طاخونه نام یکی از روستاهای کجا و شهرستان خنج می باشد؟'
 'تابستان کاستاریکایی یک فیلم کمدی آمریکایی در چه زمانی متهو است  : سال ۲۰۱۰'
 ' آا ان هونز یک منطقه مسکونی در کجا است؟'
 ' طارند بالا ، روستایی از توابع بخش جلیل آباد شهرستان پیشوا در کجا است؟'
 ' سابمارین کلاس بری یچه چیزی است که طول آن ۱۷۰ متر می\u200cباشد؟'
 'ساب پاپ یک شرکت کجایی نشر موسیقی است که در سال ۱۹۸۶ توسط جاناتان پانم ن و بروس پاویت در سیاتل ، واشینگتن تأسیس شد  : آمریکایی'
 'سابرینا فریلی یک هنرپیشه اهل کجا است؟']
[1 0 1 1 1 1 1]


In [None]:
# Turn text into numbers
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# The vectorizer's output_sequence_length will be based on the average number
# of tokens per training sentence (plus 1 as a safety margin).
# Average number of whitespace-separated tokens in the training sentences:
tokens_per_sentence = [len(sentence.split()) for sentence in X_train]
round(sum(tokens_per_sentence) / len(X_train))

14

In [None]:
# Configure the text vectorization layer
max_vocab_length = 10000  # upper bound on the number of words kept in the vocabulary
max_length = 15  # number of tokens per output sequence (how many words the model sees)

text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_sequence_length=max_length,
    output_mode="int",
)

# Learn the vocabulary from the training sentences
text_vectorizer.adapt(X_train)

In [None]:
# Run a hand-written sentence through the vectorizer to inspect its output
sample_sentence = "چه روستا چه شهر فرقی نمیکند وقتی تو خود زندان خود هستی"
sample_tokens = text_vectorizer([sample_sentence])
sample_tokens

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[  5,  42,   5,  67,   1,   1, 803, 999, 481,   1, 481,   1,   0,
          0,   0]])>

In [None]:
# Inspect the vocabulary learned by the text vectorizer
words_in_vocab = text_vectorizer.get_vocabulary()
# Most common tokens. The slice was [:10] while the name and the printed
# message both say "5"; it now matches. The list starts with the '' and
# '[UNK]' (unknown word) entries, so those appear among the top 5.
top_5_words = words_in_vocab[:5]
bottom_5_words = words_in_vocab[-5:]  # least common tokens
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}")
print(f"Bottom 5 least common words: {bottom_5_words}")

Number of words in vocab: 1706
Top 5 most common words: ['', '[UNK]', 'در', 'است', 'کجا', 'چه', 'از', 'و', 'یک', 'که']
Bottom 5 least common words: ['آساشیو', 'آرچی', 'آرواره', 'آتشین', 'آآکلمبیانا']


In [None]:
# Build the embedding layer
from tensorflow.keras import layers

tf.random.set_seed(42)

embedding = layers.Embedding(
    input_dim=max_vocab_length,        # vocabulary size the layer can index
    output_dim=128,                    # length of each embedding vector
    embeddings_initializer="uniform",  # the default: random uniform initialization
    input_length=max_length,           # number of tokens in each input sequence
    name="embedding_1",
)

embedding

<keras.layers.embeddings.Embedding at 0x7f537e1f2690>

In [None]:
# Assemble, compile and train model_1 with the Keras Functional API
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype="string")  # one raw string per example
x = text_vectorizer(inputs)  # strings -> integer token ids
x = embedding(x)  # token ids -> embedding vectors
# Average over the token axis to get one vector per example
# (try removing this layer to see what happens to the output shape)
x = layers.GlobalAveragePooling1D()(x)
# Single sigmoid unit: binary output
outputs = layers.Dense(1, activation="sigmoid")(x)
model_1 = tf.keras.Model(inputs=inputs, outputs=outputs, name="model_1_dense")

# Compile
model_1.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Train. Raw strings can be passed directly because the text preprocessing
# layer is part of the model. Note that the test split is used as validation data.
model_1_history = model_1.fit(
    X_train,
    y_train,
    epochs=5,
    validation_data=(X_test, y_test),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Evaluate on the test split.
# The test split was also the validation data during training, so this should
# match the validation result reported above (val_accuracy = 0.7891).
model_1.evaluate(x=X_test, y=y_test)



[0.4230875074863434, 0.7890772223472595]

In [None]:
# Predict on the test split; the sigmoid output yields one probability per example
model_1_pred_probs = model_1.predict(x=X_test)
model_1_pred_probs[:10]  # show just the first 10 probabilities

array([[0.5954859 ],
       [0.90590787],
       [0.36597216],
       [0.81743807],
       [0.8722389 ],
       [0.7326762 ],
       [0.61926425],
       [0.7889109 ],
       [0.8287709 ],
       [0.47493285]], dtype=float32)

In [None]:
# Show the first 10 test sentences next to their true labels.
# Only the last expression of a cell is displayed, so the sentences must be
# printed explicitly; as a bare expression they were silently discarded.
print(X_test[:10])
y_test[:10]

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1])