# Workshop 03

- Name: Ran Arino
- Student ID: 153073200
- Email: rarino@myseneca.ca
- Course: Social Media Analytics
- Course ID: BDA600NAA.07578.2241
- Professor: Dr. Pantea Koochemeshkian

In [101]:
import pandas as pd
import numpy as np
import re
import statistics

import emoji
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from gensim.models import KeyedVectors
from gensim.downloader import base_dir, load
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from keras import Sequential
from keras.layers import Dense, Dropout
from keras.regularizers import l2
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [11]:
# read the data set from data folder
data = pd.read_csv("data/full-corpus-training.csv")
data.head()

Unnamed: 0,Sentiment,TweetId,TweetText
0,positive,1.26e+17,Now all @Apple has to do is get swype on the i...
1,positive,1.26e+17,@Apple will be adding more carrier support to ...
2,positive,1.26e+17,Hilarious @youtube video - guy does a duet wit...
3,positive,1.26e+17,@RIM you made it too easy for me to switch to ...
4,positive,1.26e+17,I just realized that the reason I got into twi...


In [12]:
# remove the rows whose "Sentiment" column is "irrelevant"
new_data = data[data['Sentiment'] != 'irrelevant']
new_data['Sentiment'].value_counts()

Sentiment
neutral     2228
negative     437
positive     329
Name: count, dtype: int64

In [84]:
# cleaning the texts
def clean_texts(raw_texts: list or np.array, tagging: bool = False):
    # define result
    result = []

    # set of stopwords
    stop_words = set(stopwords.words('english'))
    # set the porter stemming
    porter = nltk.PorterStemmer()
    # initialize tweet tokenizer
    tweet_tokenizer = TweetTokenizer()

    # function to convert emojis to text
    def convert_emojis(text):
        return emoji.demojize(text, delimiters=("", ""))

    # traversing all sentences
    for sent in raw_texts:
        # apply tagging
        if tagging:
            token = tweet_tokenizer.tokenize(sent)
            tagged_words = nltk.pos_tag(token)
            # adjective ('JJ', 'JJR', 'JJS'), noun ('NN', 'NNP'), verb('VB', 'VBD', 'VBG', 'VBN', 'VBP')
            sent = " ".join([w[0] for w in tagged_words if w[1] in ['JJ', 'JJR', 'JJS', 'NN', 'VB', 'VBD', 'VBG', 'VBN', 'VBP']])

        # (1): white space removal
        sent = sent.strip()
        # (2): URL removal
        sent = re.sub(r"http[s]?://[\w?\W?]+", '', sent)
        # (3): HTML tag removal
        sent = re.sub(r'<[^>]+>', '', sent)
        # (4): Repeated words (at least four times)
        sent = re.sub(r'(.)\1{4,}', r'\1', sent)
        # (5): split attached words (at least two characters and follow the capitalized word)
        sent = re.sub(r"([\w]{2,})([A-Z])", r"\1 \2", sent)
        # (6): Punctuation removal without @ and #
        sent = re.sub(r'[^\w\s]', '', sent)
        # (7): Emoji to text
        sent = convert_emojis(sent)
        # (8): toknize exclude punct
        #tokens = nltk.regexp_tokenize(sent, r"\w+(?:'\w+)?")
        #(8): tokenize with TweetTokenizer
        tokens = tweet_tokenizer.tokenize(sent)

        # defined cleaned sentence
        clean_sent = ''
    
        # cleaning each sentence
        for w in tokens:
            # if 'w' is one of stop words, skip to the next word
            if w.lower() in stop_words:
                continue
            # add words without stemming
            clean_sent += porter.stem(w.lower()) + ' '
            #clean_sent += w.lower() + ' '

        # add clean_sent to result (make sure that the last item is always blank)
        result += [clean_sent[:-1]]

    return result

# get the clean tweet data as list
sent_list = clean_texts(np.array(new_data['TweetText'].values), True)
sent_list[:5]

['appl get swype iphon crack iphon',
 'appl ad carrier support iphon announc',
 'hilari video guy duet much love affair',
 'ri made easi switch appl iphon see ya',
 'realiz reason got twitter io appl']

In [85]:
# add clean text to the dataset
new_data.loc[:, 'CleanText'] = sent_list
new_data.head()

Unnamed: 0,Sentiment,TweetId,TweetText,CleanText
0,positive,1.26e+17,Now all @Apple has to do is get swype on the i...,appl get swype iphon crack iphon
1,positive,1.26e+17,@Apple will be adding more carrier support to ...,appl ad carrier support iphon announc
2,positive,1.26e+17,Hilarious @youtube video - guy does a duet wit...,hilari video guy duet much love affair
3,positive,1.26e+17,@RIM you made it too easy for me to switch to ...,ri made easi switch appl iphon see ya
4,positive,1.26e+17,I just realized that the reason I got into twi...,realiz reason got twitter io appl


In [86]:
new_data['Sentiment'].value_counts()

Sentiment
neutral     2228
negative     437
positive     329
Name: count, dtype: int64

In [87]:
# apply tf-idf
# create TF-IDF vectrizer
tfidf_vect = TfidfVectorizer()
# fit and transform data
matrix = tfidf_vect.fit_transform(sent_list)

# get the explanatory and target variables for machine learning
X = matrix.toarray()
y = np.array(new_data['Sentiment'])
# Encord the target varieble for nueral network
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_cat = to_categorical(y_encoded)

In [88]:
print("Explnatory Variable format: ")
print(X[:5])

print("\nTarget Variable format: ")
print(y[:5])

print("\nEncorded Target Variable: ")
print(label_encoder.classes_)
print(y_cat[:5])

Explnatory Variable format: 
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

Target Variable format: 
['positive' 'positive' 'positive' 'positive' 'positive']

Encorded Target Variable: 
['negative' 'neutral' 'positive']
[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]


In [212]:
from imblearn.over_sampling import RandomOverSampler


# define the encoder for neural nerwork
label_encoder = LabelEncoder()
label_encoder.fit(y)
# compute weights for imbalanced target variable (minimum: 0.8)
classes = np.unique(y)
class_weights = compute_class_weight('balanced', classes=classes, y=y)
class_weight_dict = dict(enumerate(class_weights))
#class_weight_dict = {k: max(v, 0.8) for k, v in dict(enumerate(class_weights)).items()}

# define statifired k-fold processes
def strat_kfold(model_name, X, y, k=5):
    # stratified k-fold cross validation
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    # fold number
    i = 1
    # performance func
    perf_func = {"Acc": accuracy_score, "Pre": precision_score, "Rec": recall_score, 
                 "F1": f1_score, "Conf": confusion_matrix}
    # set the dict of the performance results
    results = {key: [] for key in perf_func.keys()}
    # trained model at each validation
    models = []
    # apply k-hold cross validation
    for train_index, test_index in skf.split(X, y):
        print(f"start fold {i}")
        # set train data
        X_train, X_test = X[train_index], X[test_index]
        # Neural Network
        if model_name == 'neural_network':
            # set test data
            y_train, y_test = y_cat[train_index], y_cat[test_index]
            # Neural Network Model
            model = Sequential()
            model.add(Dense(128, activation='tanh', input_shape=(X.shape[1],)))
            model.add(Dense(32, activation='tanh'))
            model.add(Dense(3, activation='softmax'))  # 3 units for 3 classes
            # Compile the model
            model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])
            # Train the model
            model.fit(X_train, y_train, epochs=15, batch_size=32, validation_split=0.2, class_weight=class_weight_dict)
            # predict the target classes
            y_pred = np.argmax(model.predict(X_test), axis=1)
            # convert the y_test to
            y_test = np.argmax(y_test, axis=1)
            # add model
            models.append(model)
        # random forest
        elif model_name == 'random_forest':
            # set the test data
            y_train, y_test = y[train_index], y[test_index]
            # set model
            model = RandomForestClassifier(
                n_estimators=100, class_weight='balanced',
                random_state=42, n_jobs=-1
                )
            # train the data into the model
            model.fit(X_train, y_train)
            # predict the target classes
            y_pred = model.predict(X_test)
            # add model
            models.append(model)
        # naive bayes
        elif model_name == "naive_bayes":
            # set the test data
            y_train, y_test = y[train_index], y[test_index]
            # set model
            model = MultinomialNB()
            # train the data into the model
            model.fit(X_train, y_train)
            # predict the target classes
            y_pred = model.predict(X_test)
            # add model
            models.append(model)

        else:
            return None, None, None

        # calculate classificaiton performances
        for key in perf_func.keys():
            if key in ["Acc", "Conf"]:
                params = {}
            else:
                params = {'average': 'macro'}
                
            results[key].append(perf_func[key](y_test, y_pred, **params))
        
        # increment fold
        i += 1

    # create the min-max confusion matrix
    stacked_arrays = np.stack(results['Conf'], axis=0)
    # Combine min and max values into a tuple for each component
    min_max_conf = np.dstack((np.min(stacked_arrays, axis=0), np.max(stacked_arrays, axis=0)))

    # delete "Conf" key from dict
    del results['Conf']

    return pd.DataFrame(results), min_max_conf, models


In [213]:
# Naive Bayes Classifiers
nb_results, nb_conf, nb_models = strat_kfold("naive_bayes", X, y)
print(nb_conf)
nb_results

start fold 1
start fold 2


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


start fold 3
start fold 4


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


start fold 5
[[[  0   5]
  [ 83  87]
  [  0   0]]

 [[  0   1]
  [445 446]
  [  0   0]]

 [[  0   0]
  [ 65  66]
  [  0   0]]]


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Acc,Pre,Rec,F1
0,0.746244,0.581798,0.340909,0.299524
1,0.744574,0.415131,0.336417,0.291927
2,0.744574,0.248191,0.333333,0.28453
3,0.746244,0.471104,0.340249,0.299524
4,0.752508,0.583474,0.352273,0.321648


In [124]:
# Randon forest
rf_results, rf_conf, rf_models = strat_kfold("random_forest", X, y)
print(rf_conf)
rf_results

start fold 1
start fold 2
start fold 3
start fold 4
start fold 5
[[[ 23  31]
  [ 55  64]
  [  0   2]]

 [[  8  14]
  [420 429]
  [  8  16]]

 [[  0   2]
  [ 52  58]
  [  6  13]]]


Unnamed: 0,Acc,Pre,Rec,F1
0,0.769616,0.614973,0.469632,0.499156
1,0.772955,0.677561,0.464306,0.501356
2,0.767947,0.649425,0.483006,0.521527
3,0.767947,0.635151,0.47813,0.511537
4,0.754181,0.589071,0.433995,0.458217


In [214]:
# neural network
nw_results, nw_conf, nw_models = strat_kfold("neural_network", X, y, k=5)
print(nw_conf)
nw_results

start fold 1
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
start fold 2
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
start fold 3
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
start fold 4
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
start fold 5
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
[[[ 41  51]
  [ 28  37]
  [  4  10]]

 [[ 42  73]
  [236 321]
  [ 70 153]]

 [[ 

Unnamed: 0,Acc,Pre,Rec,F1
0,0.540902,0.486707,0.55683,0.478348
1,0.661102,0.5113,0.560182,0.527206
2,0.624374,0.500273,0.562817,0.511571
3,0.631052,0.488263,0.550386,0.504514
4,0.591973,0.499994,0.551788,0.494106


### Load Test Data

In [216]:
# check the apply test data
testing = pd.read_excel('data/testing_data.xlsx', header=None)
testing_answer = pd.read_excel('data/testing_data_answers.xlsx', header=None)
testing = testing.rename(columns={i: col for i, col in enumerate(['TweetId', "TweetText"])})
testing.head()

Unnamed: 0,TweetId,TweetText
0,126352268705538000,Come to the dark side üì±‚Äú@gretcheneclark: ...
1,126350302113824000,"Hey @apple, if you send me a free iPhone (any ..."
2,126349695676203008,Thank you @apple for Find My Mac - just locate...
3,126342268603998000,Thanks to @Apple Covent Garden #GeniusBar for ...
4,126325800080392000,@DailyDealChat @apple Thanks!!


In [217]:
# get the clean tweet data as list
sent_list_test = clean_texts(np.array(testing['TweetText'].values), tagging=True)
# transform to the TF-IDF
matrix_test = tfidf_vect.transform(sent_list_test)
test_data = matrix_test.toarray()
test_data

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Results

In [250]:
def predict_with_voting(models, X_test):
    # Collect predictions from each model
    predictions = [model.predict(X_test) for model in models]

    # Convert list of predictions to a numpy array for easy manipulation
    predictions = np.array(predictions)

    # Use majority voting for final prediction
    final_prediction = np.array([statistics.mode(predictions[:, i]) for i in range(predictions.shape[1])])

    return final_prediction

In [252]:
# Naive Bayes
# Random Forest
test_data_pred_cat = predict_with_voting(nb_models, test_data)
sum(testing_answer[1].values == test_data_pred_cat) / len(testing_answer[1].values) * 100

23.060344827586206

In [251]:
# Random Forest
test_data_pred_cat = predict_with_voting(rf_models, test_data)
sum(testing_answer[1].values == test_data_pred_cat) / len(testing_answer[1].values) * 100

30.603448275862068

In [253]:
# Neural Network

# Averaging weights of the models
weights = [model.get_weights() for model in nw_models]
new_weights = list()

for weights_list_tuple in zip(*weights):
    new_weights.append(
        np.array([np.array(weights_).mean(axis=0) for weights_ in zip(*weights_list_tuple)])
    )

# Create a new model with the same architecture
aggregated_model = Sequential([
    Dense(128, activation='relu', input_shape=(X.shape[1],)),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')
])

# Set the averaged weights
aggregated_model.set_weights(new_weights)

# show the predicted labels
test_data_pred = aggregated_model.predict(test_data)
# convert to the label and invert the encoded labels
test_data_pred_cat = label_encoder.inverse_transform(np.argmax(test_data_pred, axis=1))
# Accuacy
sum(testing_answer[1].values == test_data_pred_cat) / len(testing_answer[1].values) * 100




41.16379310344828

In [260]:
# write predicted target value

with open('prediction.txt', 'w') as f:
    f.write("\n".join(list(test_data_pred_cat)))

In [261]:
# save model
import joblib
joblib.dump(aggregated_model, 'agg_neural_network.joblib')

['agg_neural_network.joblib']