In [31]:

import numpy as np
import nltk
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score
from collections import defaultdict
import time
import re
import string

nltk.download('twitter_samples')
nltk.download('stopwords')


[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [32]:
def process_tweet(tweet):
    stemmer = nltk.PorterStemmer()
    stopwords_english = stopwords.words('english') 
    
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    
    tokenizer = nltk.TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    
    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and 
            word not in string.punctuation):
            stem_word = stemmer.stem(word)
            tweets_clean.append(stem_word)
    return tweets_clean


In [33]:
def build_freqs(tweets, ys):
    freqs = defaultdict(int)
    for y, tweet in zip(ys, tweets):
        for word in process_tweet(tweet):
            freqs[(word, y[0])] += 1
    return freqs

# Load dữ liệu twitter_samples
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

train_x = all_positive_tweets[:4000] + all_negative_tweets[:4000]
train_y = np.append(np.ones((4000,1)), np.zeros((4000,1)), axis=0)

freqs = build_freqs(train_x, train_y)


In [34]:

def extract_features(tweet, freqs):
    word_l = process_tweet(tweet)
    x = np.zeros((1, 3)) 
    x[0,0] = 1 
    for word in word_l:
        x[0,1] += freqs.get((word,1.0), 0)
        x[0,2] += freqs.get((word,0.0), 0)
    return x


In [35]:
def sigmoid(z): 
    return 1 / (1 + np.exp(-z))


In [36]:
def gradient_descent_logistic(X, y, w, alpha, num_iters=100):
    m = X.shape[0]
    for i in range(num_iters):
        z = np.dot(X, w)
        h = sigmoid(z)
        J = -1/m * (np.dot(y.T, np.log(h)) + np.dot((1-y).T, np.log(1-h)))
        w -= alpha/m * np.dot(X.T, (h-y))
    return J, w

In [37]:
def predict_logistic(X, w):
    z = np.dot(X, w)
    h = sigmoid(z)
    return (h >= 0.5).astype(int)

In [38]:
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i, :] = extract_features(train_x[i], freqs)
Y = train_y

***Custom Logistic Regression:***

In [39]:
start_time = time.time()

J, w = gradient_descent_logistic(X, Y, np.zeros((3, 1)), alpha=1e-9, num_iters=1500)
y_pred_custom = predict_logistic(X, w)
precision_custom = precision_score(Y, y_pred_custom)

time_custom = time.time() - start_time

***Sklearn Logistic Regression:***

In [40]:
start_time = time.time()

clf = LogisticRegression(max_iter=1000)
clf.fit(X, Y.ravel())

y_pred_sklearn = clf.predict(X)
precision_sklearn = precision_score(Y, y_pred_sklearn)

time_sklearn = time.time() - start_time

***Comparison:***

In [41]:
print(f"Custom: Precision = {precision_custom:.4f}, Time = {time_custom:.4f} seconds")
print(f"Sklearn: Precision = {precision_sklearn:.4f}, Time = {time_sklearn:.4f} seconds")

Custom: Precision = 0.9945, Time = 0.6402 seconds
Sklearn: Precision = 0.9845, Time = 0.0165 seconds
