# Workshop 03

- Name: Ran Arino
- Student ID: 153073200
- Email: rarino@myseneca.ca
- Course: Social Media Analytics
- Course ID: BDA600NAA.07578.2241
- Professor: Dr. Pantea Koochemeshkian

In [18]:
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [3]:
# read the data set
data = pd.read_csv("full-corpus-training.csv")
data.head()

Unnamed: 0,Sentiment,TweetId,TweetText
0,positive,1.26e+17,Now all @Apple has to do is get swype on the i...
1,positive,1.26e+17,@Apple will be adding more carrier support to ...
2,positive,1.26e+17,Hilarious @youtube video - guy does a duet wit...
3,positive,1.26e+17,@RIM you made it too easy for me to switch to ...
4,positive,1.26e+17,I just realized that the reason I got into twi...


In [4]:
# remove the rows whose "Sentiment" column is "irrelevant"
new_data = data[data['Sentiment'] != 'irrelevant']
new_data['Sentiment'].value_counts()

Sentiment
neutral     2228
negative     437
positive     329
Name: count, dtype: int64

In [5]:
# cleaning the texts
def clean_texts(raw_texts: list or np.array):
    # define result
    result = []

    # set of stopwords
    stop_words = set(stopwords.words('english'))
    # set the porter stemming
    porter = nltk.PorterStemmer()

    # traversing all sentences
    for sent in raw_texts:
        # (1): white space removal
        sent = sent.strip()
        # (2): URL removal
        sent = re.sub(r"http[s]?://[\w?\W?]+\s", '', sent)
        # (3): HTML tag removal
        sent = re.sub(r'<[^>]+>', '', sent)
        # (4): Repeated words (at least four times)
        sent = re.sub(r'(.)\1{4,}', r'\1', sent)
        # (5): split attached words (at least two characters and follow the capitalized word)
        sent = re.sub(r"([\w]{2,})([A-Z])", r"\1 \2", sent)
        # (6): toknize exclude punct
        tokens = nltk.regexp_tokenize(sent, r"\w+(?:'\w+)?")

        # defined cleaned sentence
        clean_sent = ''
    
        # cleaning each sentence
        for w in tokens:
            # if 'w' is one of stop words, skip to the next word
            if w.lower() in stop_words:
                continue
            # add words without stemming
            clean_sent += porter.stem(w.lower()) + ' '

        # add clean_sent to result (make sure that the last item is always blank)
        result += [clean_sent[:-1]]

    return result

# get the clean tweet data as list
sent_list = clean_texts(np.array(new_data['TweetText'].values))
sent_list[:5]

['appl get swype iphon crack iphon',
 'appl ad carrier support iphon 4s announc',
 'hilari youtub video guy duet appl siri pretti much sum love affair http co 8exbnqj',
 'ri made easi switch appl iphon see ya',
 'realiz reason got twitter ios5 thank appl']

In [6]:
# add clean text to the dataset
new_data.loc[:, 'CleanText'] = sent_list
new_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data.loc[:, 'CleanText'] = sent_list


Unnamed: 0,Sentiment,TweetId,TweetText,CleanText
0,positive,1.26e+17,Now all @Apple has to do is get swype on the i...,appl get swype iphon crack iphon
1,positive,1.26e+17,@Apple will be adding more carrier support to ...,appl ad carrier support iphon 4s announc
2,positive,1.26e+17,Hilarious @youtube video - guy does a duet wit...,hilari youtub video guy duet appl siri pretti ...
3,positive,1.26e+17,@RIM you made it too easy for me to switch to ...,ri made easi switch appl iphon see ya
4,positive,1.26e+17,I just realized that the reason I got into twi...,realiz reason got twitter ios5 thank appl


In [7]:
new_data['Sentiment'].value_counts()

Sentiment
neutral     2228
negative     437
positive     329
Name: count, dtype: int64

In [8]:
# apply tf-idf
# create TF-IDF vectrizer
tfidf_vect = TfidfVectorizer()
# fit and transform data
matrix = tfidf_vect.fit_transform(sent_list)

# get the explanatory and target variables for machine learning
X = matrix.toarray()
y = np.array(new_data['Sentiment'])

In [9]:
print("Explnatory Variable format: ")
print(X[:5])

print("\nTarget Variable format: ")
print(y[:5])

Explnatory Variable format: 
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

Target Variable format: 
['positive' 'positive' 'positive' 'positive' 'positive']


In [15]:
# define statifired k-fold processes
def strat_kfold(classifier, k=5):
    # stratified k-fold cross validation
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    # fold number
    i = 1
    # performance func
    perf_func = {"Acc": accuracy_score, "Pre": precision_score, "Rec": recall_score, 
                 "F1": f1_score, "Conf": confusion_matrix}
    # set the dict of the performance results
    results = {key: [] for key in perf_func.keys()}
    # apply k-hold cross validation
    for train_index, test_index in skf.split(X, y):
        print(f"start fold {i}")
        # set the train and test data
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # train the data into the model
        classifier.fit(X_train, y_train)
        # predict the target classes
        y_pred = classifier.predict(X_test)
        # calculate classificaiton performances
        for key in perf_func.keys():
            if key in ["Acc", "Conf"]:
                params = {}
            else:
                params = {'average': 'macro'}
                
            results[key].append(perf_func[key](y_test, y_pred, **params))
        
        # increment fold
        i += 1

    # create the min-max confusion matrix
    stacked_arrays = np.stack(results['Conf'], axis=0)
    # Combine min and max values into a tuple for each component
    min_max_conf = np.dstack((np.min(stacked_arrays, axis=0), np.max(stacked_arrays, axis=0)))

    # delete "Conf" key from dict
    del results['Conf']

    return pd.DataFrame(results), min_max_conf



In [16]:
# Naive Bayes Classifiers
nb_model = MultinomialNB()
nb_results, nb_conf = strat_kfold(nb_model)
print(nb_conf)
nb_results

start fold 1


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


start fold 2
start fold 3


  _warn_prf(average, modifier, msg_start, len(result))


start fold 4


  _warn_prf(average, modifier, msg_start, len(result))


start fold 5
[[[  1   1]
  [ 86  87]
  [  0   0]]

 [[  0   1]
  [445 446]
  [  0   0]]

 [[  0   0]
  [ 65  66]
  [  0   0]]]


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Acc,Pre,Rec,F1
0,0.744574,0.581382,0.337121,0.291927
1,0.746244,0.58194,0.337165,0.292378
2,0.746244,0.58194,0.337165,0.292378
3,0.744574,0.415131,0.336417,0.291927
4,0.745819,0.581798,0.337121,0.2922


In [17]:
# Randon forest
rf_model = RandomForestClassifier(
    n_estimators=100, class_weight='balanced',
    random_state=42, n_jobs=-1)

rf_results, rf_conf = strat_kfold(rf_model)
print(rf_conf)
rf_results

start fold 1
start fold 2
start fold 3
start fold 4
start fold 5
[[[ 29  35]
  [ 52  59]
  [  0   0]]

 [[ 12  16]
  [425 432]
  [  1   8]]

 [[  0   1]
  [ 57  62]
  [  4   8]]]


Unnamed: 0,Acc,Pre,Rec,F1
0,0.769616,0.614432,0.459766,0.482579
1,0.78798,0.797554,0.485879,0.523343
2,0.786311,0.78524,0.490081,0.522206
3,0.78798,0.767924,0.48466,0.519504
4,0.777592,0.768867,0.467094,0.498179
