<a href="https://colab.research.google.com/github/Runshi-Yang/JSC270_HW4_2022_Runshi/blob/main/Assignment4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part 0: Dataset Preprocessing

In [None]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.stem.porter import *
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, roc_curve

# Resources needed below: the Punkt tokenizer models and the stop-word list.
# NLTK >= 3.9 loads the tokenizer models from 'punkt_tab' instead of 'punkt',
# so both are fetched to keep word_tokenize working on old and new releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Load the train / test tweet CSVs directly from GitHub
TRAIN_CSV_URL = 'https://raw.githubusercontent.com/jsAiyaya/JSC270_Data_Science_1/main/covid-tweets-train.csv'
TEST_CSV_URL = 'https://raw.githubusercontent.com/jsAiyaya/JSC270_Data_Science_1/main/covid-tweets-test.csv'

train_data = pd.read_csv(TRAIN_CSV_URL, encoding='utf-8')
test_data = pd.read_csv(TEST_CSV_URL, encoding='utf-8')

In [None]:
# Drop the 'Unnamed: 0' column (presumably the row index saved with the CSV).
# errors='ignore' keeps this cell safe to re-run: once the column is gone, a
# plain drop would raise KeyError.
train_data = train_data.drop(columns=['Unnamed: 0'], errors='ignore')
test_data = test_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Take a quick look at the first five rows of the training set
train_data.head(5)

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,1
1,advice Talk to your neighbours family to excha...,2
2,Coronavirus Australia: Woolworths to give elde...,2
3,My food stock is not the only one which is emp...,2
4,"Me, ready to go at supermarket during the #COV...",0


In [None]:
# Inspect column dtypes and non-null counts of the training set; note that
# Sentiment is stored as object rather than as an integer type
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41155 entries, 0 to 41154
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   OriginalTweet  41155 non-null  object
 1   Sentiment      41153 non-null  object
dtypes: object(2)
memory usage: 643.2+ KB


In [None]:
# Same inspection for the test set: here Sentiment is already int64,
# unlike in train_data
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3798 entries, 0 to 3797
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   OriginalTweet  3798 non-null   object
 1   Sentiment      3798 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 59.5+ KB


In [None]:
# Keep only the rows whose Sentiment is one of the three valid labels and
# store the labels as integers in both frames.
#
# Labels are compared through their string form so that one filter covers the
# object-typed train column ('0', '1', '2') and the int-typed test column
# (0, 1, 2). It also makes the cell safe to re-run: comparing an already
# int-cast train column against the strings '0'/'1'/'2' matches nothing, so a
# second run of the old in-place version dropped every training row.
VALID_SENTIMENT_LABELS = ['0', '1', '2']

def keep_valid_sentiment(df):
  """Return a copy of df limited to rows with a valid Sentiment, cast to int.

  The original row index is preserved; df itself is not modified.
  """
  is_valid = df['Sentiment'].astype(str).isin(VALID_SENTIMENT_LABELS)
  kept = df.loc[is_valid].copy()
  kept['Sentiment'] = kept['Sentiment'].astype('int')
  return kept

train_data = keep_valid_sentiment(train_data)
test_data = keep_valid_sentiment(test_data)

Now we've finished preparing the data needed for Part I.

# **Part I: Sentiment Analysis with a Twitter Dataset**

### **(A)**

In [None]:
# Number of training tweets per sentiment label. size() counts rows per group,
# so the table always has exactly one column. count() returns one column per
# remaining DataFrame column, which made the single-name column rename fail
# with a length mismatch once 'tokens' had been added (e.g. on a re-run).
count_table = train_data.groupby('Sentiment').size().to_frame('count')
print(count_table, "\n")

# Share of the training set taken by each label
total_num = len(train_data)
prop_table = (count_table / total_num).rename(columns={'count': 'proportion'})
print(prop_table)

           count
Sentiment       
0          15397
1           7712
2          18042 

           proportion
Sentiment            
0            0.374159
1            0.187407
2            0.438434


### **(B)**

In [None]:
def tokenize(df):
  """Add a 'tokens' column holding the NLTK word tokenization of each tweet.

  Reads df['OriginalTweet'] and writes df['tokens'] in place; returns None.
  """
  df['tokens'] = df['OriginalTweet'].map(nltk.word_tokenize)

for _split_df in (train_data, test_data):
  tokenize(_split_df)

### **(C)**

In [None]:
def remove_url(df):
  """Blank out URL text in every token of df['tokens'].

  Within each token, everything from the first occurrence of 'http' to the end
  of the token is deleted. Tokens that start with 'http' therefore become
  empty strings (they are not removed from the list). The column is
  overwritten in place and nothing is returned.
  """
  url_tail = re.compile('http.*')
  df['tokens'] = [
      [url_tail.sub('', token) for token in tokens]
      for tokens in df['tokens']
  ]

for _split_df in (train_data, test_data):
  remove_url(_split_df)

### **(D)**

In [None]:
def remove_punct(df):
  """Strip punctuation characters from every token of df['tokens'].

  Any character that is neither a word character (letters, digits,
  underscore) nor whitespace is deleted. Tokens made only of punctuation
  become empty strings and stay in the list. The column is overwritten in
  place and nothing is returned.
  """
  # Raw string: '\w' and '\s' are invalid escapes in a normal string literal
  # and trigger a DeprecationWarning (SyntaxWarning on Python 3.12+).
  punct = re.compile(r'[^\w\s]')
  df['tokens'] = [
      [punct.sub('', token) for token in tokens]
      for tokens in df['tokens']
  ]

for _split_df in (train_data, test_data):
  remove_punct(_split_df)

In [None]:
def lowercase(df):
  """Lower-case every token in df['tokens'].

  The column is overwritten in place and nothing is returned.
  """
  df['tokens'] = df['tokens'].map(
      lambda tokens: [token.lower() for token in tokens])

for _split_df in (train_data, test_data):
  lowercase(_split_df)
train_data.head()

Unnamed: 0,OriginalTweet,Sentiment,tokens
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,1,"[, menyrbie, , phil_gahan, , chrisitv, , , tco..."
1,advice Talk to your neighbours family to excha...,2,"[advice, talk, to, your, neighbours, family, t..."
2,Coronavirus Australia: Woolworths to give elde...,2,"[coronavirus, australia, , woolworths, to, giv..."
3,My food stock is not the only one which is emp...,2,"[my, food, stock, is, not, the, only, one, whi..."
4,"Me, ready to go at supermarket during the #COV...",0,"[me, , ready, to, go, at, supermarket, during,..."


### **(E)**

In [None]:
# Porter stemmer instance shared by every call to stem()
stemmer = PorterStemmer()

def stem(df):
  """Replace each token in df['tokens'] with the output of stemmer.stem.

  The column is overwritten in place and nothing is returned.
  """
  df['tokens'] = df['tokens'].map(
      lambda tokens: [stemmer.stem(token) for token in tokens])

for _split_df in (train_data, test_data):
  stem(_split_df)

### **(F)**

In [None]:
# Stop-word list: the first 100 entries of NLTK's English stop-word corpus.
# NOTE(review): these entries are unstemmed, while the tokens have already
# been stemmed by the time remove_stopwords runs, so stemmed forms such as
# 'thi' (visible among the most frequent tokens printed at the end of Part I)
# are not filtered out; confirm this ordering is intended.
sw = stopwords.words('english')[:100]

def remove_stopwords(df, stop_words=None):
  """Drop stop words from every token list in df['tokens'].

  Args:
    df: DataFrame with a 'tokens' column of token lists; the column is
      overwritten in place.
    stop_words: optional iterable of tokens to remove. When omitted, the
      notebook-level list `sw` is used, as before.

  Returns:
    None.
  """
  if stop_words is None:
    stop_words = sw
  # A set gives constant-time membership tests; scanning the list for every
  # token was needlessly slow on the full corpus.
  blocked = set(stop_words)
  df['tokens'] = [
      [token for token in tokens if token not in blocked]
      for tokens in df['tokens']
  ]

for _split_df in (train_data, test_data):
  remove_stopwords(_split_df)

### **(G)**

In [None]:
def remove_empty_string(df):
  """Remove empty-string tokens from every token list in df['tokens'].

  The column is overwritten in place and nothing is returned.
  """
  df['tokens'] = [
      list(filter(lambda token: token != '', tokens))
      for tokens in df['tokens']
  ]

for _split_df in (train_data, test_data):
  remove_empty_string(_split_df)

In [None]:
# Pull the token lists (features) and sentiment labels out as numpy arrays
X_train = train_data['tokens'].to_numpy()
y_train = train_data['Sentiment'].to_numpy()
X_test = test_data['tokens'].to_numpy()
y_test = test_data['Sentiment'].to_numpy()

# Train and test token lists placed back to back
X = np.concatenate((X_train, X_test))

In [None]:
# Number of distinct tokens across the train and test tweets combined
print(f"Total vocabulary length: {len(np.unique(np.concatenate(X)))}")

Total vocabulary kength: 78963


In [None]:
# The tweets are already tokenized and cleaned, so CountVectorizer's own
# preprocessing and tokenizing steps are replaced by a pass-through function.
def override_fcn(doc):
  """Return doc unchanged; doc is expected to be a list of tokens."""
  return doc

# Bag-of-words counts, limited to at most 3500 features
count_vec = CountVectorizer(
    analyzer='word',
    tokenizer=override_fcn,
    preprocessor=override_fcn,
    token_pattern=None,
    max_features=3500)

# Learn the vocabulary from the training tweets only. Fitting on train + test
# lets the test set influence which features are kept (data leakage).
count_vec.fit(X_train)
counts_train = count_vec.transform(X_train)
counts_test = count_vec.transform(X_test)

# Show the shape and a small dense slice of each matrix. Calling toarray() on
# the full sparse matrices allocates a dense rows x features array just to
# print an elided preview.
print("train data:", counts_train.shape)
print(counts_train[:5].toarray())
print("\n test data:", counts_test.shape)
print(counts_test[:5].toarray())

train data:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

 test data:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


### **(H)**

In [None]:
# Train a multinomial Naive Bayes classifier on the bag-of-words counts
nb = MultinomialNB()
nb.fit(counts_train, y_train)

# Predictions for both splits
y_train_preds = nb.predict(counts_train)
y_test_preds = nb.predict(counts_test)

# Accuracy on each split; the error rate is its complement
train_acc = accuracy_score(y_train, y_train_preds)
test_acc = accuracy_score(y_test, y_test_preds)

print(f"Training accuracy: {train_acc:.4f}")
print(f"Training error:    {1-train_acc:.4f}")
print(f"Test accuracy:     {test_acc:.4f}")
print(f"Test error:        {1-test_acc:.4f}")

Training accuracy: 0.7298
Training error:    0.2702
Test accuracy:     0.6930
Test error:        0.3070


In [None]:
# For every sentiment label, list the five most frequent tokens among the
# training tweets carrying that label, together with their raw counts.
for label in range(3):
  flag = (y_train == label)
  tokens_label = X_train[flag]
  words, counts = np.unique(np.concatenate(tokens_label), return_counts=True)
  prob = counts / counts.sum()
  # argsort is ascending: reverse it and keep the first five entries
  top5_idx = np.argsort(prob)[::-1][:5]
  print(f"Sentiment {label}: {words[top5_idx]}")
  print(f"Counts {label}:    {counts[top5_idx]}")

Sentiment 0: ['coronaviru' 'covid19' 'price' 'food' 'thi']
Counts 0:    [6736 4610 4345 3638 3223]
Sentiment 1: ['coronaviru' 'covid19' 'store' 'supermarket' 'price']
Counts 1:    [3812 2566 1588 1441 1364]
Sentiment 2: ['coronaviru' 'covid19' 'store' 'thi' 'price']
Counts 2:    [7511 5682 3916 3781 3338]
