## Imports necessary libraries

In [1]:
import pandas as pd
import numpy as np
from resources.arabic_preprocessing import Arabic_preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

from keras.layers import Dense, LSTM, Dropout
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasRegressor
from keras.callbacks import EarlyStopping
from keras.optimizers import RMSprop

# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

Using TensorFlow backend.


## Reading Dataset

In [2]:
sample=pd.read_csv('data/joy_train.csv',header=None, names=['index', 'tweet','emotion','score'])
sample.head()

Unnamed: 0,index,tweet,emotion,score
0,2018-ar-joy-train-1,منسم يقول لصيدلي\n\nعندك شريط بنادول؟\nقاله:نع...,joy,0.422
1,2018-ar-joy-train-2,· بتكبرين ياللي من رضا الأيام جيتي يوم ميلادك ...,joy,0.565
2,2018-ar-joy-train-3,#هدوء #عمرو دياب #مزاج ❤️❤️❤️❤️❤️,joy,0.438
3,2018-ar-joy-train-4,سبحان الله #رضى الوالدين له سحر عجيب,joy,0.375
4,2018-ar-joy-train-5,الله يرضى عليك. \nبدل متقولي: (عيد سعيد)، قولي...,joy,0.438


## Preprocess Tweets

In [3]:
prep = Arabic_preprocessing()
sample['tweet']=sample['tweet'].apply(lambda x : prep.preprocess_arabic_text(x))
sample.head()

Unnamed: 0,index,tweet,emotion,score
0,2018-ar-joy-train-1,منسم قول لصيدلي عند شريط بنادول قالهنعم قال شغ...,joy,0.422
1,2018-ar-joy-train-2,· بتكبر الي رضا ايم جيتي ميلاد فرح عمر مو اريخ...,joy,0.565
2,2018-ar-joy-train-3,هدوء عمرو دياب مزاج حب ️ حب ️ حب ️ حب ️ حب ️,joy,0.438
3,2018-ar-joy-train-4,اله رضي والد سحر عجيب,joy,0.375
4,2018-ar-joy-train-5,اله رضي بدل متقولي عيد سعيد قولي قدام 6 ليالي ...,joy,0.438


## Preparing Training Data

In [4]:
X = sample['tweet'].tolist()
y_train = sample['score'].tolist()
count_vect = CountVectorizer(lowercase=False, token_pattern=r'\S+')
count_vect = count_vect.fit(X)  # bag of words
tfidf = TfidfTransformer()
tfidf = tfidf.fit(count_vect.transform(X))
X_train_count = count_vect.transform(X)
X_train_tfidf = tfidf.transform(X_train_count.toarray())
input_count = X_train_count.shape[1]
print(input_count)

3019


## Preparing the model : Neural Network

In [14]:
def baseline_model(input_len, learning_rate=0.001):
    # create model
    model = Sequential()
    model.add(Dense(4000, input_dim=input_len, activation='relu'))
    model.add(Dense(2000, activation='relu'))
    #model.add(Dropout(0.2))
    model.add(Dense(1000, activation='relu'))
    #model.add(Dropout(0.2))
    model.add(Dense(500, activation='relu'))
    #model.add(Dropout(0.2))
    model.add(Dense(1))
    # Compile model
    rmsprop = RMSprop(lr=learning_rate)
    model.compile(loss='mse', optimizer=rmsprop, metrics=['mse'])
    return model

## Training the Model
**run either 'Without GridSearch' or 'Manual Grid Search', not both!**

#### Without GridSearch

In [15]:
early_stop = EarlyStopping(monitor='mean_squared_error', patience=5)
callbacks = [early_stop]
estimator = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=32, input_len=input_count)
#history = estimator.fit(X_train_count.toarray(), y_train)
history = estimator.fit(X_train_tfidf.toarray(), y_train, callbacks=callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100


#### Manual Grid Search

In [None]:
early_stop = EarlyStopping(monitor='mean_squared_error', patience=5)
callbacks = [early_stop]

learning_rates = [0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001]
models = []
histories = []
for lr in learning_rates:
    # evaluate model with standardized dataset
    estimator = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=32, input_len=input_count, learning_rate=lr)
    #history = estimator.fit(X_train_count.toarray(), y_train)
    history = estimator.fit(X_train_tfidf.toarray(), y_train, callbacks=callbacks)
    models.append(estimator)
    histories.append(history)

## Preparing test data

In [17]:
test_sample=pd.read_csv('joy_test.csv',header=None, names=['index', 'tweet','emotion','score'])
test_sample['tweet']=test_sample['tweet'].apply(lambda x : prep.preprocess_arabic_text(x))

X_test = test_sample['tweet'].tolist()
y_test = test_sample['score'].tolist()
X_test_count = count_vect.transform(X_test)
X_test_tfidf = tfidf.transform(X_test_count.toarray())

## Evaluating the model

In [18]:
y_pred = estimator.predict(X_test_tfidf.toarray())
y_pred_train = estimator.predict(X_train_tfidf.toarray())
r2 = r2_score(y_test, y_pred)
r2_train = r2_score(y_train, y_pred_train)
rmse = mean_squared_error(y_test, y_pred)
rmse_train = mean_squared_error(y_train, y_pred_train)
print("Train: R2 : {0:f}, RMSE : {1:f}".format( r2_train, rmse_train ) )
print( "Test: R2 : {0:f}, RMSE : {1:f}".format( r2, rmse ) )

Train: R2 : 0.992522, RMSE : 0.000254
Test: R2 : -0.039579, RMSE : 0.033070


In [20]:
for i in range(22,44):
    print(y_test[-i], ' >> ', y_pred[-i], ' >> ', count_vect.inverse_transform(X_test_count[-i]))

0.8440000000000001  >>  0.64284  >>  [array(['ارب', 'اله', 'ايم', 'ايمي', 'جعل', 'جمل', 'حب', 'خير', 'رفيق',
       'زفاف', 'سعد', 'عروس', 'فرح', 'لك', 'موعد', 'يا', 'يتم', '💍'], 
      dtype='<U22')]
0.563  >>  0.571871  >>  [array(['ساعت', 'سعاد', 'فيهم'], 
      dtype='<U22')]
0.43799999999999994  >>  0.521823  >>  [array(['اول', 'حقيق', 'خير', 'دعم', 'رجال', 'زعام', 'شكر', 'عرس', 'غير',
       'قدم', 'قلب', 'كبير', 'لجميع', 'هلال'], 
      dtype='<U22')]
0.25  >>  0.588215  >>  [array(['بس', 'تام', 'حب', 'حزن', 'حين', 'حيوي', 'شعور', 'طاق', 'ومو'], 
      dtype='<U22')]
0.359  >>  0.578135  >>  [array(['بالي', 'حلم', 'رب', 'سما', 'مو', 'يا'], 
      dtype='<U22')]
0.703  >>  0.699126  >>  [array(['انو', 'بتحب', 'جو', 'سعاد', 'شخص', 'عالم', 'عيد', 'فرح'], 
      dtype='<U22')]
0.75  >>  0.667876  >>  [array(['بتهج', 'سخر', 'شتغل', 'طبيعي', 'علي', 'كفي', 'معاي', 'مو'], 
      dtype='<U22')]
0.452  >>  0.314568  >>  [array(['اثر', 'تشويق'], 
      dtype='<U22')]
0.172  >>  0.479073  >