In [1]:
import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier


In [2]:
# Fetch the NLTK `twitter_samples` corpus that the following cells read from.
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\tuann\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [3]:
# Read the raw tweet texts of both sentiment classes from the NLTK corpus.
positive_tweets, negative_tweets = (
    nltk.corpus.twitter_samples.strings(fileid)
    for fileid in ('positive_tweets.json', 'negative_tweets.json')
)

# Tweets 0-99 of each class form the training set and tweets 100-199 the test
# set; in both lists the positive tweets come first.
training_tweets = [*positive_tweets[:100], *negative_tweets[:100]]
test_tweets = [*positive_tweets[100:200], *negative_tweets[100:200]]

In [None]:
# Export the training and test tweets as CSV files with columns `text,label`.
#
# The rows are serialised by pandas rather than assembled with f-strings, so
# tweets containing double quotes, commas or line breaks are quoted/escaped
# correctly instead of producing malformed rows.

# Set lookup: constant-time membership instead of scanning the full list of
# positive tweets once per training tweet.
positive_lookup = set(positive_tweets)

train_table = pd.DataFrame({
    "text": training_tweets,
    "label": ["positive" if tweet in positive_lookup else "negative"
              for tweet in training_tweets],
})
train_table.to_csv("train.csv", index=False, encoding="utf-8")

# The test file has the same two columns, with the label left empty.
test_table = pd.DataFrame({"text": test_tweets, "label": ""})
test_table.to_csv("test.csv", index=False, encoding="utf-8")

In [5]:

# Ground-truth labels, in the same order as the tweet lists (100 positive
# tweets followed by 100 negative ones): 1 = positive, 0 = negative.
train_labels = [int(position < 100) for position in range(200)]
test_labels = list(train_labels)

# Turn the raw text into bag-of-words count vectors. The vocabulary is learned
# from the training tweets only and then reused for the test tweets.
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(training_tweets)
X_test = vectorizer.transform(test_tweets)

# Train the decision tree (fixed seed for reproducibility) and predict the
# sentiment of the test tweets.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, train_labels)
y_pred = clf.predict(X_test)

y_pred


array([0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0])

# ChatGPT Responses:


In [6]:
# Load the labels returned by ChatGPT; the cells below read its `text` and `label` columns.
chatgpt_responses = pd.read_csv("gpt.csv")
chatgpt_responses

Unnamed: 0,text,label
0,@metalgear_jp @Kojima_Hideo I want you're T-sh...,negative
1,@AxeRade haw phela if am not looking like Mom ...,negative
2,@zaynmalik prince charming on stage :) x https...,negative
3,i have really good luck :),negative
4,Stats for the day have arrived. 1 new follower...,positive
...,...,...
196,Rest in peace mo :(,positive
197,@MHSScho not much chance of game tonight :( #t...,negative
198,@OL_EYY ahhh u see? U wouldn't wake up :(,positive
199,Mumma! Im on 7% :( @LMCDark_Angel,negative


In [7]:
# Encode ChatGPT's string labels as integers: positive -> 1, negative -> 0.
gpt_label_column = chatgpt_responses['label']
gpt_pred = gpt_label_column.map({'positive': 1, 'negative': 0}).to_numpy()
gpt_pred

array([0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 1], dtype=int64)

In [8]:
# Sanity check: does the response file contain more rows than there are tree predictions?
len(y_pred) < len(gpt_pred)

True

In [9]:
# Show both lengths side by side (tree predictions, ChatGPT labels).
len(y_pred), len(gpt_pred)

(200, 201)

In [24]:
# Re-order ChatGPT's labels to follow `test_tweets`, matching on exact tweet text.
chatgpt = pd.DataFrame({
    "tweets": test_tweets
})

# Keep only the first response per distinct text before joining. A left merge
# emits one row per matching right-hand key, so a repeated text in the response
# file would add rows and break the position-by-position pairing with `y_pred`.
chatgpt = chatgpt.merge(
    chatgpt_responses[["text", "label"]].drop_duplicates(subset="text"),
    left_on="tweets", right_on="text", how="left",
)

chatgpt = chatgpt.drop(columns=["text"])

# One output row per test tweet; tweets without a matching response get NaN.
assert len(chatgpt) == len(test_tweets), "merge changed the number of rows"
chatgpt

Unnamed: 0,tweets,label
0,@metalgear_jp @Kojima_Hideo I want you're T-sh...,negative
1,@AxeRade haw phela if am not looking like Mom ...,negative
2,@zaynmalik prince charming on stage :) x https...,negative
3,i have really good luck :),negative
4,Stats for the day have arrived. 1 new follower...,positive
...,...,...
195,Rest in peace mo :(,positive
196,@MHSScho not much chance of game tonight :( #t...,negative
197,@OL_EYY ahhh u see? U wouldn't wake up :(,positive
198,Mumma! Im on 7% :( @LMCDark_Angel,negative


In [31]:
# Encode the aligned ChatGPT labels as numbers: positive -> 1, negative -> 0.
# Rows whose label is missing or not one of these two strings become NaN.
aligned_gpt_labels = chatgpt['label']
gpt_pred = aligned_gpt_labels.map({'positive': 1, 'negative': 0}).to_numpy()
gpt_pred

array([ 0.,  0.,  0.,  0.,  1.,  1.,  0., nan,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  0., nan, nan,  1.,  1.,  1., nan,  0.,  1.,  1.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,
        0.,  0.,  0.,  0., nan,  0.,  1.,  1.,  1.,  1.,  0.,  1.,  0.,
        0.,  1.,  1.,  1.,  1.,  0., nan,  1.,  0.,  0.,  1.,  1.,  1.,
       nan,  0.,  1.,  0.,  0., nan,  1.,  1.,  1.,  1.,  1.,  0.,  0.,
        1.,  1.,  1., nan,  1.,  1.,  1.,  1., nan,  1.,  1.,  1.,  1.,
       nan,  0.,  0.,  1.,  1., nan,  0.,  1.,  1.,  0.,  0., nan,  0.,
        0.,  1.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  1.,  1.,
        0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,
       nan,  1.,  1.,  1.,  1.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  1.,
        0.,  0., nan,  0.,  0.,  1.,  0., nan,  0.,  0.,  0.,  0

--> There are some missing values (NaN); these are mistakes from ChatGPT --> we compare the remaining predictions by dropping the same positions from both arrays.

In [None]:
# True at every position where ChatGPT's label is present (not NaN).
mask = np.logical_not(np.isnan(gpt_pred))

# Apply the same mask to both arrays so the remaining entries stay paired.
gpt_clean, y_clean = gpt_pred[mask], y_pred[mask]

# NOTE(review): the reference here is `y_pred` (the decision tree's output),
# not the ground-truth `test_labels`, so the printed figure is the agreement
# rate with the tree - confirm that this is the intended metric.
if len(y_clean) == len(gpt_clean):
    total = len(gpt_clean)
    matches = (gpt_clean == y_clean).sum()
    accuracy = matches / total

    print(f"Accuracy: {accuracy:.2%} ({matches}/{total})")

Accuracy: 77.47% (141/182)


# DeepSeek response


In [43]:
# Load the labels returned by DeepSeek (columns `text` and `label` are used below).
# Malformed lines are still dropped, but with a warning instead of silently:
# every dropped row later shows up as a NaN label after the merge, so the
# reader should be able to see how many rows were lost at parse time.
deepseek_responses = pd.read_csv("deepseek.csv", sep=",", quotechar='"', on_bad_lines="warn")
deepseek_responses

Unnamed: 0,text,label
0,@metalgear_jp @Kojima_Hideo I want you're T-sh...,positive
1,@AxeRade haw phela if am not looking like Mom ...,positive
2,@zaynmalik prince charming on stage :) x https...,positive
3,i have really good luck :),positive
4,Stats for the day have arrived. 1 new follower...,positive
...,...,...
196,Rest in peace mo :(,negative
197,@MHSScho not much chance of game tonight :( #t...,negative
198,@OL_EYY ahhh u see? U wouldn't wake up :(,negative
199,Mumma! Im on 7% :( @LMCDark_Angel,negative


In [52]:
# Re-order DeepSeek's labels to follow `test_tweets`, matching on exact tweet text.
deepseek = pd.DataFrame({
    "tweets": test_tweets
})

# Keep only the first response per distinct text before joining. A left merge
# emits one row per matching right-hand key, so a repeated text in the response
# file would add rows and break the position-by-position pairing with `y_pred`.
deepseek = deepseek.merge(
    deepseek_responses[["text", "label"]].drop_duplicates(subset="text"),
    left_on="tweets", right_on="text", how="left",
)

deepseek = deepseek.drop(columns=["text"])

# One output row per test tweet; tweets without a matching response get NaN.
assert len(deepseek) == len(test_tweets), "merge changed the number of rows"
deepseek

Unnamed: 0,tweets,label
0,@metalgear_jp @Kojima_Hideo I want you're T-sh...,positive
1,@AxeRade haw phela if am not looking like Mom ...,positive
2,@zaynmalik prince charming on stage :) x https...,positive
3,i have really good luck :),positive
4,Stats for the day have arrived. 1 new follower...,positive
...,...,...
195,Rest in peace mo :(,negative
196,@MHSScho not much chance of game tonight :( #t...,negative
197,@OL_EYY ahhh u see? U wouldn't wake up :(,negative
198,Mumma! Im on 7% :( @LMCDark_Angel,negative


In [53]:
# Encode the aligned DeepSeek labels as numbers: positive -> 1, negative -> 0.
# Rows whose label is missing or not one of these two strings become NaN.
aligned_deepseek_labels = deepseek['label']
deepseek_pred = aligned_deepseek_labels.map({'positive': 1, 'negative': 0}).to_numpy()
deepseek_pred

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1., nan, nan,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1., nan, nan,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1., nan,  1.,  1.,  1.,  1., nan,  1.,  1.,  1.,  1.,
       nan,  1.,  1.,  1.,  1.,  1.,  1.,  1., nan,  0.,  0., nan,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0., nan,  0.,  0.,  0.,  0.,  0.,  0.,
        0., nan,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., nan,  0.,  0.,
        0.,  0., nan,  0.,  0.,  0.,  0., nan,  0.,  0.,  0.,  0

The same issue that occurred with ChatGPT appears here --> solve it the same way.

In [None]:
# True at every position where DeepSeek's label is present (not NaN).
# The negation matters: without it the mask selects exactly the NaN rows, and
# since NaN never compares equal to anything the match count would always be 0.
mask = ~np.isnan(deepseek_pred)

# Filter both arrays with the same mask so the remaining entries stay paired.
deepseek_clean = deepseek_pred[mask]
y_clean   = y_pred[mask]

# NOTE(review): the reference here is `y_pred` (the decision tree's output),
# not the ground-truth `test_labels`, so the printed figure is the agreement
# rate with the tree - confirm that this is the intended metric.
if len(deepseek_clean) == len(y_clean):
    matches = np.sum(deepseek_clean == y_clean)
    total = len(deepseek_clean)
    accuracy = matches / total

    print(f"Accuracy: {accuracy:.2%} ({matches}/{total})")

Accuracy: 66.67% (124/186)


# Grok Response

In [51]:
# Load the labels returned by Grok (columns `text` and `label` are used below).
# Malformed lines are still dropped, but with a warning instead of silently:
# every dropped row later shows up as a NaN label after the merge, so the
# reader should be able to see how many rows were lost at parse time.
grok_responses = pd.read_csv("grok.csv", sep=",", quotechar='"', on_bad_lines="warn")
grok_responses

Unnamed: 0,text,label
0,@metalgear_jp @Kojima_Hideo I want you're T-sh...,positive
1,@AxeRade haw phela if am not looking like Mom ...,positive
2,@zaynmalik prince charming on stage :) x https...,positive
3,i have really good luck :),positive
4,Stats for the day have arrived. 1 new follower...,positive
...,...,...
194,Rest in peace mo :(,negative
195,@MHSScho not much chance of game tonight :( #t...,negative
196,@OL_EYY ahhh u see? U wouldn't wake up :(,negative
197,Mumma! Im on 7% :( @LMCDark_Angel,negative


In [55]:
# Re-order Grok's labels to follow `test_tweets`, matching on exact tweet text.
grok = pd.DataFrame({
    "tweets": test_tweets
})

# Keep only the first response per distinct text before joining. A left merge
# emits one row per matching right-hand key, so a repeated text in the response
# file would add rows and break the position-by-position pairing with `y_pred`.
grok = grok.merge(
    grok_responses[["text", "label"]].drop_duplicates(subset="text"),
    left_on="tweets", right_on="text", how="left",
)

grok = grok.drop(columns=["text"])

# One output row per test tweet; tweets without a matching response get NaN.
assert len(grok) == len(test_tweets), "merge changed the number of rows"
grok

Unnamed: 0,tweets,label
0,@metalgear_jp @Kojima_Hideo I want you're T-sh...,positive
1,@AxeRade haw phela if am not looking like Mom ...,positive
2,@zaynmalik prince charming on stage :) x https...,positive
3,i have really good luck :),positive
4,Stats for the day have arrived. 1 new follower...,positive
...,...,...
195,Rest in peace mo :(,negative
196,@MHSScho not much chance of game tonight :( #t...,negative
197,@OL_EYY ahhh u see? U wouldn't wake up :(,negative
198,Mumma! Im on 7% :( @LMCDark_Angel,negative


In [56]:
# Encode the aligned Grok labels as numbers: positive -> 1, negative -> 0.
# Rows whose label is missing or not one of these two strings become NaN.
aligned_grok_labels = grok['label']
grok_pred = aligned_grok_labels.map({'positive': 1, 'negative': 0}).to_numpy()
grok_pred

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1., nan,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1., nan, nan,  1., nan,  1., nan,  1.,  1.,  0.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1., nan,  1.,  1.,  1., nan,  1.,  1., nan,  1.,
        1.,  1.,  1.,  1.,  1.,  1., nan, nan,  1.,  1., nan,  1.,  1.,
       nan,  1.,  1.,  1.,  1., nan,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1., nan,  1., nan,  1.,  1., nan,  1.,  1.,  1.,  1.,
       nan,  1.,  1.,  1.,  1., nan,  1.,  1., nan,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0., nan,  0.,  0.,  0.,  0., nan,  0.,
       nan, nan,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., nan,  0.,  0.,
        0.,  0., nan,  0.,  0.,  0.,  0., nan,  0.,  0.,  0.,  0

In [57]:
# True at every position where Grok's label is present (not NaN).
mask = np.logical_not(np.isnan(grok_pred))

# Apply the same mask to both arrays so the remaining entries stay paired.
grok_clean, y_clean = grok_pred[mask], y_pred[mask]

# NOTE(review): the reference here is `y_pred` (the decision tree's output),
# not the ground-truth `test_labels`, so the printed figure is the agreement
# rate with the tree - confirm that this is the intended metric.
if len(y_clean) == len(grok_clean):
    total = len(grok_clean)
    matches = (grok_clean == y_clean).sum()
    accuracy = matches / total

    print(f"Accuracy: {accuracy:.2%} ({matches}/{total})")

Accuracy: 66.67% (114/171)
