In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

import re
import string

from tqdm import tqdm
import light_stemmer_py36

from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

import multiprocessing
from gensim.models import Word2Vec
from time import time
from gensim.models.phrases import Phrases, Phraser
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

import nltk
nltk.download('punkt')

import mysql.connector

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Reading Data

In [2]:
df1 = pd.read_csv('content_type_full.csv')
df1.shape

(47637, 29)

In [3]:
df2 = pd.read_csv('content_type.csv')
df2.shape

(43431, 5)

# Merging

In [4]:
df = df1.merge(df2, left_on='tweet_link', right_on='tweet_link')
df = df.drop(columns=['body_y', 'tweet_id_y']).rename(columns={'body_x':'body', 'tweet_id_x':'tweet_id'})
df['content_type'].value_counts()

opinion          35063
fact              5542
other             3608
spam              2440
hatful speech      531
sarcasm            454
Name: content_type, dtype: int64

In [5]:
df.shape

(47638, 31)

# SQL connection

In [6]:
connection = mysql.connector.connect(
  host="",
  user="data_science",
  password="data-$cience",
  database="annotation"
)

mycursor = connection.cursor(buffered=True)

### query 1

In [7]:
query = "SELECT tweet_link, body, content_type, `content_type:confidence` FROM results_content_sentiment_topic LIMIT 100000;"
mycursor.execute(query)
print(pd.read_sql(query, connection).shape)
df = df.append(pd.read_sql(query, connection));
df.shape

(64894, 4)


(112532, 31)

### query 2

In [8]:
query = "SELECT tweet_link, body, content_type, `content_type:confidence` FROM results_saudi_content_sentiment_topic WHERE content_type != 'chat' LIMIT 100000;"
mycursor.execute(query)
print(pd.read_sql(query, connection).shape)
df = df.append(pd.read_sql(query, connection));
df.shape

(7576, 4)


(120108, 31)

### Droping duplicates

In [9]:
df = df.drop_duplicates(subset="tweet_link")
df.shape

(112752, 31)

In [10]:
df['content_type'].value_counts()

opinion          70548
fact             13022
other             8224
spam              7184
hatful speech     1556
sarcasm           1248
Name: content_type, dtype: int64

# Pre-Processing

In [11]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation, 
    remove words containing numbers and removing stopwords.'''
    # to lowercase
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    # Removing URLs
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    # Removing emojis
    text = re.sub(r'&#\d*','',text)
    return text

* **Drop NaN from 'tweet_body'**

In [12]:
df.dropna(subset=['body'], how='all', inplace=True)
df.shape

(112752, 31)

* **Stemming 'body'**

In [13]:
df['body'] = df['body'].apply(light_stemmer_py36.stem, args=(True,))

* **Droping unwanted types**

In [14]:
# Removing 'hateful speach'
df = df[df['content_type'] != 'hatful speech']
# Removing 'sarcasm'
df = df[df['content_type'] != 'sarcasm']
# Removing 'hateful speach'
df = df[df['content_type'] != 'other']

df['content_type'].value_counts()

opinion    70548
fact       13022
spam        7184
Name: content_type, dtype: int64

* **Filtering df with confidence > 0.8**

In [15]:
df_filterd = df[df['content_type:confidence'] > 0.8]
df_filterd['content_type'].value_counts()

opinion    27094
fact        2216
spam        1884
Name: content_type, dtype: int64

* **Undersampling**

In [16]:
df_filterd = df_filterd.sort_values(by='content_type').reset_index()[['tweet_id', 'body', 'content_type']]
df_filterd.drop(df_filterd[df_filterd['content_type'] == 'opinion'].index[2200:], inplace=True)
df_filterd['content_type'].value_counts()

fact       2216
opinion    2200
spam       1884
Name: content_type, dtype: int64

In [17]:
df_filterd.shape

(6300, 3)

In [18]:
df_filterd.head()

Unnamed: 0,tweet_id,body,content_type
0,,عاجل | رهان: ليس قوي حر غيير عتراض علي خطوة لك...,fact
1,,اطلقت #وزارة_الداخل يوم جمعة درة «تحدي ابشر» ش...,fact
2,1.2512559817550316e+18,"هديد صريح قتل من ""مغرد"" جب زميل جمال ريان. وما...",fact
3,,كلام خطير داخل #البرلمان_التونسي.. ائب ونسي ها...,fact
4,,@Sami_Alhomood اِنَّاۤ اَنزَلۡنَـٰهُ ِی َیۡلَة...,fact


# word2vec

In [19]:
words = [nltk.word_tokenize(sent) for sent in df_filterd['body']]

In [20]:
import multiprocessing
cores = multiprocessing.cpu_count()

In [21]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [22]:
t = time()

w2v_model.build_vocab(words, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 11:38:42: collecting all words and their counts
INFO - 11:38:42: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 11:38:42: collected 30079 word types from a corpus of 311521 raw words and 6300 sentences
INFO - 11:38:42: Loading a fresh vocabulary
INFO - 11:38:42: effective_min_count=20 retains 1531 unique words (5% of original 30079, drops 28548)
INFO - 11:38:42: effective_min_count=20 leaves 238786 word corpus (76% of original 311521, drops 72735)
INFO - 11:38:42: deleting the raw counts dictionary of 30079 items
INFO - 11:38:42: sample=6e-05 downsamples 840 most-common words
INFO - 11:38:42: downsampling leaves estimated 65459 word corpus (27.4% of prior 238786)
INFO - 11:38:42: estimated required memory for 1531 words and 300 dimensions: 4439900 bytes
INFO - 11:38:42: resetting layer weights


Time to build vocab: 0.01 mins


In [23]:
t = time()

w2v_model.train(words, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 11:38:42: training model with 7 workers on 1531 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2
INFO - 11:38:42: worker thread finished; awaiting finish of 6 more threads
INFO - 11:38:42: worker thread finished; awaiting finish of 5 more threads
INFO - 11:38:42: worker thread finished; awaiting finish of 4 more threads
INFO - 11:38:42: worker thread finished; awaiting finish of 3 more threads
INFO - 11:38:42: worker thread finished; awaiting finish of 2 more threads
INFO - 11:38:42: worker thread finished; awaiting finish of 1 more threads
INFO - 11:38:42: worker thread finished; awaiting finish of 0 more threads
INFO - 11:38:42: EPOCH - 1 : training on 311521 raw words (65313 effective words) took 0.1s, 746776 effective words/s
INFO - 11:38:43: worker thread finished; awaiting finish of 6 more threads
INFO - 11:38:43: worker thread finished; awaiting finish of 5 more threads
INFO - 11:38:43: worker thread finished; awaiting finish of 4 more thread

INFO - 11:38:44: worker thread finished; awaiting finish of 1 more threads
INFO - 11:38:44: worker thread finished; awaiting finish of 0 more threads
INFO - 11:38:44: EPOCH - 13 : training on 311521 raw words (65540 effective words) took 0.1s, 766628 effective words/s
INFO - 11:38:44: worker thread finished; awaiting finish of 6 more threads
INFO - 11:38:44: worker thread finished; awaiting finish of 5 more threads
INFO - 11:38:44: worker thread finished; awaiting finish of 4 more threads
INFO - 11:38:44: worker thread finished; awaiting finish of 3 more threads
INFO - 11:38:44: worker thread finished; awaiting finish of 2 more threads
INFO - 11:38:44: worker thread finished; awaiting finish of 1 more threads
INFO - 11:38:44: worker thread finished; awaiting finish of 0 more threads
INFO - 11:38:44: EPOCH - 14 : training on 311521 raw words (65327 effective words) took 0.1s, 757560 effective words/s
INFO - 11:38:44: worker thread finished; awaiting finish of 6 more threads
INFO - 11:38

INFO - 11:38:45: worker thread finished; awaiting finish of 3 more threads
INFO - 11:38:45: worker thread finished; awaiting finish of 2 more threads
INFO - 11:38:45: worker thread finished; awaiting finish of 1 more threads
INFO - 11:38:45: worker thread finished; awaiting finish of 0 more threads
INFO - 11:38:45: EPOCH - 26 : training on 311521 raw words (65079 effective words) took 0.1s, 787630 effective words/s
INFO - 11:38:45: worker thread finished; awaiting finish of 6 more threads
INFO - 11:38:45: worker thread finished; awaiting finish of 5 more threads
INFO - 11:38:45: worker thread finished; awaiting finish of 4 more threads
INFO - 11:38:45: worker thread finished; awaiting finish of 3 more threads
INFO - 11:38:45: worker thread finished; awaiting finish of 2 more threads
INFO - 11:38:45: worker thread finished; awaiting finish of 1 more threads
INFO - 11:38:45: worker thread finished; awaiting finish of 0 more threads
INFO - 11:38:45: EPOCH - 27 : training on 311521 raw wor

Time to train the model: 0.04 mins


In [24]:
w2v_model.wv.most_similar(positive=["عروض"])

INFO - 11:38:45: precomputing L2-norms of word weight vectors


[('خفيض', 0.9361923933029175),
 ('خصوم', 0.8997214436531067),
 ('تجر', 0.8619424104690552),
 ('ون', 0.8453816175460815),
 ('كوبون', 0.8403830528259277),
 ('نون', 0.8333255052566528),
 ('اقوي', 0.8201906681060791),
 ('127803', 0.815452516078949),
 ('128087', 0.8002704977989197),
 ('127879', 0.7798187136650085)]

In [25]:
def make_feature_vec(words, model, num_features):
    """
    Average the word vectors for a set of words
    """
    feature_vec = np.zeros((num_features,),dtype="float32")  # pre-initialize (for speed)
    nwords = 0.
    index2word_set = set(model.wv.index2word)  # words known to the model

    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            feature_vec = np.add(feature_vec,model[word])
    
    feature_vec = np.divide(feature_vec, nwords)
    return feature_vec


def get_avg_feature_vecs(reviews, model, num_features):
    """
    Calculate average feature vectors for all reviews
    """
    counter = 0.
    review_feature_vecs = np.zeros((len(reviews),num_features), dtype='float32')  # pre-initialize (for speed)
    
    for review in reviews:
        review_feature_vecs[counter] = make_feature_vec(review, model, num_features)
        counter = counter + 1.
    return review_feature_vecs

In [26]:
vectors = df_filterd['body'].apply(nltk.word_tokenize).apply(make_feature_vec, model=w2v_model, num_features=300)

  if sys.path[0] == '':


In [27]:
matrix = np.matrix(vectors.tolist())
matrix

matrix([[ 0.04510013,  0.00696807,  0.02429755, ...,  0.04861059,
          0.02181991,  0.09750631],
        [ 0.05910603,  0.0995969 ,  0.16164218, ..., -0.0318923 ,
         -0.03813394, -0.04824292],
        [-0.00186133, -0.04256683, -0.04633546, ...,  0.07554024,
          0.03007343,  0.10521155],
        ...,
        [-0.06829782, -0.02704829,  0.03433926, ..., -0.296299  ,
         -0.09889024, -0.13089025],
        [ 0.02122662,  0.04868143,  0.14645995, ..., -0.20831859,
         -0.06780344, -0.04168772],
        [-0.02098111,  0.2796912 ,  0.26398656, ..., -0.1741605 ,
         -0.10549866, -0.09894565]], dtype=float32)

# Model Estimation and Evaluation

In [28]:
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding, LSTM, SimpleRNN
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [29]:
# define documents
docs = df_filterd['body']
# define class labels
labels = df_filterd['content_type']

In [30]:
len(labels)

6300

In [31]:
# integer encode the documents
vocab_size = 30000
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(len(encoded_docs))

encoded_labels = LabelEncoder().fit_transform(labels)
print(encoded_labels)

onehot_encoder = OneHotEncoder(sparse=False)
encoded_labels = encoded_labels.reshape(len(encoded_labels), 1)
onehot_encoded = onehot_encoder.fit_transform(encoded_labels)
print(onehot_encoded)

6300
[0 0 0 ... 2 2 2]
[[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 ...
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]


In [32]:
# pad documents to a max length of 4 words
max_length = 300
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs.shape)

(6300, 300)


In [44]:
# define the model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=max_length))
model.add(Flatten())
model.add(Dense(3, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 300, 50)           1500000   
_________________________________________________________________
flatten_2 (Flatten)          (None, 15000)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 45003     
Total params: 1,545,003
Trainable params: 1,545,003
Non-trainable params: 0
_________________________________________________________________
None


In [45]:
X_train, X_test, y_train, y_test = train_test_split(padded_docs, onehot_encoded, test_size=0.1)

In [46]:
# fit the model
model.fit(X_train, y_train, epochs=3, batch_size=64, verbose=1, validation_data=(X_test, y_test))
# evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 92.380953


In [77]:
y_pred = model.predict(X_test)

### Classification Report

In [85]:
# y_pred to one_hot
y_pred = (y_pred == y_pred.max(axis=1)[:,None]).astype(int)

In [84]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92       212
           1       0.88      0.91      0.90       220
           2       0.98      0.94      0.96       198

   micro avg       0.92      0.92      0.92       630
   macro avg       0.93      0.92      0.93       630
weighted avg       0.93      0.92      0.92       630
 samples avg       0.92      0.92      0.92       630

