In [1]:
# data from twitter
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import twitter_samples, stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk import FreqDist, classify, NaiveBayesClassifier

import re, string, random

In [2]:
def is_contain_chinese(str):
    for _char in str:
        if '\u4e00' <= _char <= '\u9fa5':
            return True
    return False

def is_contain_english(str):
    for _char in str:
        if 'a' <= _char <= 'z' or 'A' <= _char <= 'Z':
            return True
    return False


In [3]:
def remove_noise(sentence_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(sentence_tokens):

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

def get_sentence_for_model(cleaned_tokens_list):
    for sentence_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in sentence_tokens)


In [4]:
import pandas as pd
df = pd.read_csv('data_labelled.csv', header = 3, nrows = 400)

# English
e_df = df.copy()
e_index = [x for x in range(len(df)) if is_contain_chinese(str(df.iloc[x, 1]))]
e_df.drop(e_index, inplace=True)
print(e_index)
print(len(df))

# Hybrid
h_df = df.copy()
h_index = [x for x in range(len(df)) if not (is_contain_english(str(df.iloc[x, 1])) and is_contain_chinese(str(df.iloc[x, 1]))) ]
h_df.drop(h_index, inplace=True)
print(h_index)

# Chinese
c_df = df.copy()
c_index = [x for x in range(len(df)) if is_contain_english(str(df.iloc[x, 1]))]
c_df.drop(c_index, inplace=True)
print(c_index)

print(len(e_df))

[0, 1, 22, 25, 30, 36, 38, 39, 40, 43, 45, 46, 47, 48, 51, 55, 56, 57, 58, 59, 60, 68, 73, 76, 86, 87, 93, 94, 96, 100, 102, 108, 110, 113, 115, 116, 118, 120, 122, 123, 125, 129, 132, 135, 136, 138, 140, 141, 144, 149, 152, 155, 156, 157, 159, 160, 161, 162, 167, 171, 172, 173, 175, 176, 184, 187, 188, 189, 190, 191, 198, 199, 200, 203, 204, 206, 208, 233, 236, 237, 239, 240, 242, 248, 249, 255, 260, 263, 265, 271, 280, 283, 287, 295, 308, 313, 316, 317, 319, 323, 324, 325, 326, 329, 342, 349, 354, 357, 358, 362, 364, 368, 371, 374, 380, 382, 383, 384, 385, 387, 388, 393, 395, 397, 398, 399]
400
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 95, 97, 98, 99, 100, 101, 102, 103, 104, 

In [5]:
df = e_df.iloc[:,1].tolist()
#print(df)
pos_df = [df[x] for x in range(len(df)) if e_df.iloc[x,2] == 1]
neg_df = [df[x] for x in range(len(df)) if e_df.iloc[x,2] == -1]


stop_words = stopwords.words('english')

pos_tokens = [word_tokenize(sentence) for sentence in pos_df ]
neg_tokens = [word_tokenize(sentence) for sentence in neg_df ]

pos_cleaned_tokens_list = []
neg_cleaned_tokens_list = []

for tokens in pos_tokens:
    pos_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

for tokens in neg_tokens:
    neg_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

all_pos_words = get_all_words(pos_cleaned_tokens_list)

freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))

positive_tokens_for_model = get_sentence_for_model(pos_cleaned_tokens_list)
negative_tokens_for_model = get_sentence_for_model(neg_cleaned_tokens_list)

positive_dataset = [(sentence_dict, "Positive") for sentence_dict in positive_tokens_for_model]

negative_dataset = [(sentence_dict, "Negative") for sentence_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset


random.shuffle(dataset)

train_data = dataset[:220]
test_data = dataset[220:]
print(len(train_data))
print(test_data)

[('good', 11), ('nil', 11), ('none', 8), ('nothing', 6), ('much', 5), ('think', 4), ('nope', 3), ("n't", 3), ('know', 3), ('teaching', 2)]
220
[({}, 'Positive'), ({'may': True, 'include': True, 'exercise': True, 'session': True, 'lesson': True}, 'Negative'), ({'interaction': True, 'student': True, 'teacher': True}, 'Negative'), ({'would': True, 'good': True, 'lecture': True, 'recording': True, 'provide': True}, 'Negative'), ({'video': True, 'assignment': True, 'harsh': True, 'online': True, 'teaching': True}, 'Negative'), ({'system': True, 'zoom': True, 'maybe': True, 'secure': True, 'online': True, 'corporation/meeting': True, 'software': True, 'like': True, 'microsoft': True, 'teams': True, 'use': True}, 'Negative'), ({'interaction': True, 'student': True}, 'Negative'), ({'interactive': True, 'activity': True}, 'Negative'), ({}, 'Positive'), ({'except': True, 'internet': True, 'problem': True, 'teacher': True, "'s": True, 'supervision': True, 'somehow': True, 'weak': True}, 'Negative

In [4]:
import pandas as pd
p_df = pd.read_csv('Good features.csv', header = 0, nrows = 900)
n_df = pd.read_csv('Improvement.csv', header = 0, nrows = 700)

# English
p_e_df = p_df.copy()
p_e_index = [x for x in range(len(p_df)) if is_contain_chinese(str(p_df.iloc[x, 0]))]
p_e_df.drop(p_e_index, inplace=True)
#print(p_e_df)

# Hybrid
p_h_df = p_df.copy()
p_h_index = [x for x in range(len(p_df)) if not (is_contain_english(str(p_df.iloc[x, 0])) and is_contain_chinese(str(p_df.iloc[x, 0]))) ]
p_h_df.drop(p_h_index, inplace=True)
#print(p_h_index)

# Chinese
p_c_df = p_df.copy()
p_c_index = [x for x in range(len(p_df)) if is_contain_english(str(p_df.iloc[x, 0]))]
p_c_df.drop(p_c_index, inplace=True)
#print(p_c_index)

#print(len(p_e_df))

# English
n_e_df = n_df.copy()
n_e_index = [x for x in range(len(n_df)) if is_contain_chinese(str(n_df.iloc[x, 0]))]
n_e_df.drop(n_e_index, inplace=True)
#print(n_e_df)

# Hybrid
n_h_df = n_df.copy()
n_h_index = [x for x in range(len(n_df)) if not (is_contain_english(str(n_df.iloc[x, 0])) and is_contain_chinese(str(n_df.iloc[x, 0]))) ]
n_h_df.drop(n_h_index, inplace=True)
#print(n_h_index)

# Chinese
n_c_df = n_df.copy()
n_c_index = [x for x in range(len(n_df)) if is_contain_english(str(n_df.iloc[x, 0]))]
n_c_df.drop(n_c_index, inplace=True)
#print(n_c_index)

#print(len(n_e_df))

In [120]:
pos_df = p_e_df.iloc[:,0].tolist()
neg_df = n_e_df.iloc[:,0].tolist()
#print(pos_df)
#print(neg_df)


stop_words = stopwords.words('english')

pos_tokens = [word_tokenize(sentence) for sentence in pos_df ]
neg_tokens = [word_tokenize(sentence) for sentence in neg_df ]

pos_cleaned_tokens_list = []
neg_cleaned_tokens_list = []

for tokens in pos_tokens:
    pos_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

for tokens in neg_tokens:
    neg_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

all_pos_words = get_all_words(pos_cleaned_tokens_list)

freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))

positive_tokens_for_model = get_sentence_for_model(pos_cleaned_tokens_list)
negative_tokens_for_model = get_sentence_for_model(neg_cleaned_tokens_list)

positive_dataset = [(sentence_dict, "Positive") for sentence_dict in positive_tokens_for_model]

negative_dataset = [(sentence_dict, "Negative") for sentence_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset


random.shuffle(dataset)
for data in dataset:
    if len(data[0]) == 0:
        dataset.remove(data)
    if len(data[0]) == 1:
        if data[0] == {'none': True} or data[0] == {'nil': True} or data[0] == {'nothing': True} or data[0] == {'nope': True} or data[0] == {'N/A': True} or data[0] == {'n/a': True}:
            dataset.remove(data)
print(len(dataset))
train_data = dataset[:900]
random.shuffle(train_data)
dev_test_data = dataset[850:1050]
random.shuffle(dev_test_data)
test_data = dataset[1050:]
print(len(train_data))
print(len(test_data))
#print(test_data)
'''
for test in test_data:
    if len(test[0]) == 1:
        if test[0] == {'none': True} or test[0] == {'nil': True} or test[0] == {'nothing': True} or test[0] == {'nope': True} or test[0] == {'N/A': True}:
            test_data.remove(test)
'''
print(len(test_data))
#print(test_data)

[('lecture', 123), ('time', 106), ('student', 100), ('good', 93), ('course', 89), ('question', 88), ('online', 85), ('video', 83), ('class', 78), ('easy', 65)]
1214
900
164
164


In [130]:
classifier = NaiveBayesClassifier.train(train_data)
print(classify.accuracy(classifier, dev_test_data))
for i in dev_test_data:
    if classifier.classify(i[0]) != i[1]:
        print(i)
print("Accuracy is:", classify.accuracy(classifier, test_data))
print(classifier.show_most_informative_features(10))
acc = 0
ans = ""
for i in test_data:
    if i[0] == {}:
        ans = "Positive"
    else:
        ans = classifier.classify(i[0])
    if  ans == i[1]:
        acc = acc + 1
    else:
        print(i)
print(acc/(len(test_data)))

0.795
({'like': True, 'normal': True, 'class': True, 'sometimes': True, 'good': True, 'cuz': True, 'may': True, 'want': True, 'spend': True, 'time': True, 'think': True, 'question': True, 'along': True, 'turn': True, 'column': True, 'microphone': True, 'real': True, 'hard': True, 'achieve': True, 'however': True, 'discuss': True, 'zoom': True}, 'Positive')
({'video': True, 'website': True, 'like': True, 'youtube': True, 'share': True}, 'Positive')
({'n/a': True}, 'Negative')
({'appraisal': True, 'method': True, 'become': True, 'presentation': True, 'help': True, 'exercise': True, 'cooperation': True, 'expression': True, 'ability': True}, 'Positive')
({}, 'Negative')
({'timely': True, 'feedback': True, 'communication': True, 'prof': True, 'student': True}, 'Positive')
({'perform': True, 'online': True, 'talk': True, 'let': True, 'us': True, 'know': True, 'stuff': True}, 'Positive')
({'mate': True, 'system': True, 'cripple': True, 'online': True, 'teaching': True, 'students': True, 'relu

In [75]:
from nltk.sentiment import SentimentIntensityAnalyzer

acc = 0
sia = SentimentIntensityAnalyzer()
for x in dataset:
    res = sia.polarity_scores(str(x))
    comp = res['compound']
    del res['compound']
    #print(x, '\n', 'result:', max(res, key=res.get),'\n', 'compound:', comp, '\n')
    if (max(res, key=res.get) == "pos" and x[1] == "Positive"): #or (max(res, key=res.get) == "neu" and x[1] == "Negative"):
        acc = acc + 1
        #print(x)
print(acc)
print(len(pos_df))
print(len(dataset))
count = 0
for a in dataset:
    if a[1] == "Positive":
        count = count + 1
print(count)
print(acc/len(dataset))

547
766
1287
731
0.425019425019425


In [85]:
count = 0
for x in pos_df:
    #print(x)
    test_sen = x
    test_tokens = remove_noise(word_tokenize(test_sen))
    #print(test_sen,'\n', classifier.classify(dict([token, True] for token in test_tokens)))
    if classifier.classify(dict([token, True] for token in test_tokens)) == "Positive":
        count = count + 1
print(count/len(pos_df))

count = 0
for x in neg_df:
    #print(x)
    test_sen = x
    test_tokens = remove_noise(word_tokenize(test_sen))
    #print(test_sen,'\n', classifier.classify(dict([token, True] for token in test_tokens)))
    if classifier.classify(dict([token, True] for token in test_tokens)) == "Negative":
        count = count + 1
print(count/len(pos_df))

test_sen = "none"
test_tokens = remove_noise(word_tokenize(test_sen))

print(test_sen,'\n', classifier.classify(dict([token, True] for token in test_tokens)))

0.8407310704960835
0.6540469973890339
none 
 Negative


In [11]:
# tweet training set

positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

all_pos_words = get_all_words(positive_cleaned_tokens_list)

freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))

positive_tokens_for_model = get_sentence_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_sentence_for_model(negative_cleaned_tokens_list)

positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

random.shuffle(dataset)

train_data = dataset[:7000]
test_data = dataset[7000:]

classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

custom_tweet = "everything is good"

custom_tokens = remove_noise(word_tokenize(custom_tweet))

print(custom_tweet, '\n', classifier.classify(dict([token, True] for token in custom_tokens)))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]
Accuracy is: 0.996
Most Informative Features
                      :( = True           Negati : Positi =   2080.2 : 1.0
                      :) = True           Positi : Negati =   1637.1 : 1.0
                     sad = True           Negati : Positi =     34.4 : 1.0
                follower = True           Positi : Negati =     22.2 : 1.0
                     bam = True           Positi : Negati =     21.9 : 1.0
                followed = True           Negati : Positi =     20.7 : 1.0
                  arrive = True           Positi : Negati =     18.7 : 1.0
           @justinbieber = True           Negati : Positi =     18.2 : 1.0
               community = True           Positi : Negati =     15.4 : 1.0
                 welcome = True           Positi : Negati =     13.3 : 1.0
None
everything is good 
 Positive
