## Classifying Tweets as written by a Male or Female 

**Sequential Network Model**

In [62]:
# some necessary packages
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.preprocessing import LabelEncoder
import pickle
import numpy as np
import pandas as pd

# set seed for reproducibility
np.random.seed(1234)

In [63]:
df = pd.read_csv('twit.csv', header=0, usecols=[5,19], encoding='latin-1')


# remove all entries with unknown and brand as gender
df = df[df['gender'] != 'unknown']
df = df[df['gender'] != 'brand']

# drop nan values
df.dropna(inplace=True,axis=0)

# convert genders from female and male to 1 and 0 respectively
df.gender = [1 if each == "female" else 0 for each in df.gender] 


print('rows and columns:', df.shape)
print(df.head())

rows and columns: (12894, 2)
   gender                                               text
0       0  Robbie E Responds To Critics After Win Against...
1       0  ÛÏIt felt like they were my friends and I was...
2       0  i absolutely adore when louis starts the songs...
3       0  Hi @JordanSpieth - Looking at the url - do you...
4       1  Watching Neighbours on Sky+ catching up with t...


In [64]:
# split df into train and test
i = np.random.rand(len(df)) < 0.8
train = df[i]
test = df[~i]
print("train data size: ", train.shape)
print("test data size: ", test.shape)

train data size:  (10326, 2)
test data size:  (2568, 2)


In [65]:
# set up X and Y
num_labels = 2
vocab_size = 25000
batch_size = 350

# fit the tokenizer on the training data
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train.text)

x_train = tokenizer.texts_to_matrix(train.text, mode='tfidf')
x_test = tokenizer.texts_to_matrix(test.text, mode='tfidf')

encoder = LabelEncoder()
encoder.fit(train.gender)
y_train = encoder.transform(train.gender)
y_test = encoder.transform(test.gender)

# check shape
print("train shapes:", x_train.shape, y_train.shape)
print("test shapes:", x_test.shape, y_test.shape)
print("test first five labels:", y_test[:5])

train shapes: (10326, 25000) (10326,)
test shapes: (2568, 25000) (2568,)
test first five labels: [1 1 1 0 1]


In [66]:
# fit model
model = models.Sequential()
model.add(layers.Dense(32, input_dim=vocab_size, kernel_initializer='normal', activation='relu'))
model.add(layers.Dense(1, kernel_initializer='normal', activation='sigmoid'))
 
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
 
 # Added early stoppage to prevent overfitting
early_stopping = EarlyStopping(
    min_delta=0.001, # min change in metrics
    patience=8,     # min epochs 
    restore_best_weights=True,
)

history = model.fit(x_train, y_train, callbacks=[early_stopping],
                    batch_size=batch_size,
                    epochs=30,
                    verbose=1,
                    validation_split=0.1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30


In [67]:
# evaluate
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print('Accuracy: ', score[1])

Accuracy:  0.6179906725883484


In [68]:
print(score)

[0.6554991602897644, 0.6179906725883484]


In [69]:
# get predictions so we can calculate more metrics
pred = model.predict(x_test)
pred_labels = [1 if p>0.5 else 0 for p in pred]



In [70]:
pred[:10]

array([[0.36923313],
       [0.5102895 ],
       [0.6064557 ],
       [0.24041091],
       [0.57609755],
       [0.43390504],
       [0.5763473 ],
       [0.24579242],
       [0.44657132],
       [0.6130683 ]], dtype=float32)

In [71]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('accuracy score: ', accuracy_score(y_test, pred_labels))
print('precision score: ', precision_score(y_test, pred_labels))
print('recall score: ', recall_score(y_test, pred_labels))
print('f1 score: ', f1_score(y_test, pred_labels))

accuracy score:  0.6179906542056075
precision score:  0.6358897989575577
recall score:  0.6344725111441307
f1 score:  0.6351803644477502


**Recurrent NN**

In [72]:
from tensorflow.keras import preprocessing
max_features = 10000
maxlen = 500
batch_size = 500

# pad the data to maxlen
x_train2 = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test2 = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

# build a Sequential model with Embedding and SimpleRNN layers

model2 = models.Sequential()
model2.add(layers.Embedding(max_features, 32))
model2.add(layers.SimpleRNN(32))
model2.add(layers.Dense(1, activation='sigmoid'))

In [73]:
model2.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, None, 32)          320000    
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 32)                2080      
                                                                 
 dense_15 (Dense)            (None, 1)                 33        
                                                                 
Total params: 322,113
Trainable params: 322,113
Non-trainable params: 0
_________________________________________________________________


In [74]:
# compile
model2.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [75]:
# train

history2 = model2.fit(x_train2,
                    y_train,
                    epochs=10,
                    batch_size=batch_size,
                    validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [76]:
from sklearn.metrics import classification_report

pred2 = model2.predict(x_test2)
pred2 = [1.0 if p>= 0.5 else 0.0 for p in pred2]
print(classification_report(y_test, pred2))

              precision    recall  f1-score   support

           0       0.33      0.00      0.00      1222
           1       0.52      1.00      0.69      1346

    accuracy                           0.52      2568
   macro avg       0.43      0.50      0.34      2568
weighted avg       0.43      0.52      0.36      2568



**Convolutional NN**

In [77]:
max_features = 10000
maxlen = 25000
batch_size = 100

x_train3 = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test3 = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

In [78]:
x_train3.shape

(10326, 25000)

In [79]:
# build a Sequential model 1D convnet

model3 = models.Sequential()
model3.add(layers.Embedding(max_features, 128, input_length=maxlen)) 
model3.add(layers.Conv1D(32, 7, activation='relu')) 
model3.add(layers.MaxPooling1D(5)) 
model3.add(layers.Conv1D(32, 7, activation='relu')) 
model3.add(layers.GlobalMaxPooling1D())
model3.add(layers.Dense(1))

In [80]:
model3.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 25000, 128)        1280000   
                                                                 
 conv1d_4 (Conv1D)           (None, 24994, 32)         28704     
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 4998, 32)         0         
 1D)                                                             
                                                                 
 conv1d_5 (Conv1D)           (None, 4992, 32)          7200      
                                                                 
 global_max_pooling1d_2 (Glo  (None, 32)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_16 (Dense)            (None, 1)               

In [81]:
# compile

model3.compile(optimizer=tf.keras.optimizers.RMSprop(lr=1e-4),  # set learning rate
              loss='binary_crossentropy',
              metrics=['accuracy'])



In [82]:
# train

history = model.fit(x_train3,
                    y_train,
                    epochs=10,
                    batch_size=batch_size,
                    validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [83]:
pred3 = model3.predict(x_test3)
pred3 = [1.0 if p>= 0.5 else 0.0 for p in pred3]
print(classification_report(y_test, pred3))

              precision    recall  f1-score   support

           0       0.48      1.00      0.64      1222
           1       0.00      0.00      0.00      1346

    accuracy                           0.48      2568
   macro avg       0.24      0.50      0.32      2568
weighted avg       0.23      0.48      0.31      2568



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Embedding**

In [84]:
maxlen = 20

x_train4 = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test4 = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

In [85]:
# set up the Embedding layer in a Sequential model

model4 = models.Sequential()
model4.add(layers.Embedding(max_features, 8, input_length=maxlen))
model4.add(layers.Flatten())
model4.add(layers.Dense(16, activation='relu'))
model4.add(layers.Dense(1, activation='sigmoid'))

model4.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model4.summary()

history4 = model4.fit(x_train4, y_train, epochs=10, batch_size=batch_size, validation_split=0.2)

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 20, 8)             80000     
                                                                 
 flatten_2 (Flatten)         (None, 160)               0         
                                                                 
 dense_17 (Dense)            (None, 16)                2576      
                                                                 
 dense_18 (Dense)            (None, 1)                 17        
                                                                 
Total params: 82,593
Trainable params: 82,593
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [86]:
pred4 = model4.predict(x_test4)
pred4 = [1.0 if p>= 0.5 else 0.0 for p in pred4]
print(classification_report(y_test, pred4))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1222
           1       0.52      1.00      0.69      1346

    accuracy                           0.52      2568
   macro avg       0.26      0.50      0.34      2568
weighted avg       0.27      0.52      0.36      2568



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Analysis

I was at first confused by the low percentage accuracy I got when looking at the results for these models.  I then went on kaggle and looked at other user's notebooks using the same dataset and the low accuracy seemed to actually be normal. I assume this is a result of the problem inherntly being complex making it hard for the networks to pick up on patterns. Comparing the accuracy of the different models it seemd that the sequential model performed the best with a 61% accuracy while, RNN, CNN, and embedding approaches all produced around a 50% accuracy. RNN innitially performed better than CNN but it was taking hours to complete so I had to increase the batch size. After I did some research it seems that sequential can perform better than RNN and CNN when input data has no temporal or spatial structure. Meaning that the order of the words dont matter as much as word choice. I think this result makes sense because there has been a lot of research done in how men and women text differently, for example women are 3.5x more likely to use emoticons. This fact would most likely transfer to twitter as well.