In [1]:
import numpy as np
import nltk
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score, roc_auc_score
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import time
import re
import string
import math

nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

train_x = train_pos + train_neg 
test_x = test_pos + test_neg

train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [3]:
def process_tweet(tweet):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True) #the tokenizer will downcase everything except for emoticons
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and   # remove stopwords
                word not in string.punctuation): # remove punctuation
            stem_word = stemmer.stem(word)
            tweets_clean.append(stem_word)

    return tweets_clean

In [4]:
def build_freqs(tweets, ys):
    yslist = np.squeeze(ys).tolist()
    # start with an empty dict and populate it by looping over all tweets
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1

    return freqs

freqs = build_freqs(train_x, train_y)

In [5]:
def extract_features(tweet, freqs, process_tweet=process_tweet):
    # process_tweet tokenizes, stems, and removes stopwords
    word_l = process_tweet(tweet)
    
    # 3 elements in the form of a 1 x 3 vector
    x = np.zeros((1, 3)) 
    
    #bias term is set to 1
    x[0,0] = 1    
    # loop through each word in the list of words
    for word in word_l:
        
        # increment the word count for the positive label 1
        if (word, 1) in freqs.keys():
            x[0,1] += freqs[(word, 1)]
        
        # increment the word count for the negative label 0
        if (word, 0) in freqs.keys():
            x[0,2] += freqs[(word, 0)]
        
    assert(x.shape == (1, 3))
    return x

In [6]:
def sigmoid(z): 
    h = 1. / (1. + np.exp(-z))
    return h

In [7]:
def gradient_descent_logistic(x, y, theta, alpha, num_iters):
    # get 'm', the number of rows in matrix X
    m = len(x)
    losses = []
    for i in range(0, num_iters):
        
        # get z, the dot product of x and theta
        z = np.dot(x, theta)
        
        # get the sigmoid of z
        h = sigmoid(z)
        
        # calculate the cost function
        J = - (np.dot(y.T, np.log(h)) + np.dot((1-y).T, np.log(1-h))) / float(m)
        losses.append(float(J))
        # update the weights theta
        theta = theta - (alpha * np.dot(x.T, (h-y))) / float(m)
    
    J = float(J)

    return J, theta

In [8]:
def predict_tweet(tweet, freqs, theta):
    # extract the features of the tweet and store it into x
    x = extract_features(tweet, freqs)
    # make the prediction using x and theta
    y_pred = sigmoid(np.dot(x, theta))
    
    return y_pred

In [9]:
def predict_logistic(X, w):
    z = np.dot(X, w)
    h = sigmoid(z)
    return (h >= 0.5).astype(int)

In [10]:
# collect the features 'x' and stack them into a matrix 'X'
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i, :]= extract_features(train_x[i], freqs)

# training labels corresponding to X
Y = train_y

X_test = np.zeros((len(test_x), 3))
for i in range(len(test_x)):
    X_test[i, :] = extract_features(test_x[i], freqs)
Y_test = np.squeeze(test_y)

***Task 1***

***Custom Logistic Regression:***

In [11]:
start_time = time.time()

J, w = gradient_descent_logistic(X, Y, np.zeros((3, 1)), 1e-9, 10000)
y_pred_custom = predict_logistic(X_test, w)

time_custom = time.time() - start_time

accuracy_custom = accuracy_score(test_y, y_pred_custom)
precision_custom = precision_score(test_y, y_pred_custom)
recall_custom = recall_score(Y_test, y_pred_custom)
f1_custom = f1_score(Y_test, y_pred_custom)
roc_auc_custom = roc_auc_score(test_y, y_pred_custom)

  losses.append(float(J))
  J = float(J)


***Sklearn Logistic Regression:***

In [12]:
Y = np.squeeze(train_y)
start_time = time.time()

clf = LogisticRegression()
clf.fit(X, Y)

y_pred_sklearn = clf.predict(X_test)

time_sklearn = time.time() - start_time

accuracy_sklearn = accuracy_score(Y_test, y_pred_sklearn)
precision_sklearn = precision_score(Y_test, y_pred_sklearn)
recall_sklearn = recall_score(Y_test, y_pred_sklearn)
f1_sklearn = f1_score(Y_test, y_pred_sklearn)
roc_auc_sklearn = roc_auc_score(Y_test, y_pred_custom)

***Comparison:***

In [13]:
print(f"Custom : Accuracy = {accuracy_custom:.4f}, "
      f"Precision = {precision_custom:.4f}, "
      f"Recall = {recall_custom:.4f}, "
      f"F1 = {f1_custom:.4f}, "
      f"ROC-AUC = {roc_auc_custom:.4f}, "
      f"Time = {time_custom:.4f} seconds")

print(f"Sklearn: Accuracy = {accuracy_sklearn:.4f}, "
      f"Precision = {precision_sklearn:.4f}, "
      f"Recall = {recall_sklearn:.4f}, "
      f"F1 = {f1_sklearn:.4f}, "
      f"ROC-AUC = {roc_auc_sklearn:.4f}, "
      f"Time = {time_sklearn:.4f} seconds")

Custom : Accuracy = 0.9960, Precision = 0.9930, Recall = 0.9990, F1 = 0.9960, ROC-AUC = 0.9960, Time = 4.8870 seconds
Sklearn: Accuracy = 0.9950, Precision = 0.9920, Recall = 0.9980, F1 = 0.9950, ROC-AUC = 0.9960, Time = 0.0397 seconds


- Custom logistic regression model:
This model does not apply regularization, so the weights are optimized more closely to the training data. As a result, the metrics such as Accuracy, Precision, Recall, and F1 are slightly higher compared to the library-based model. However, the drawback is a higher risk of overfitting when applied to unseen data, since the model tends to fit too tightly to the training set.

- Scikit-learn logistic regression model:
This model uses L2 regularization by default to prevent overfitting and improve the model’s generalization ability. Due to the presence of regularization, some metrics such as Accuracy, Precision, Recall, and F1 are slightly lower than those of the custom model. On the other hand, scikit-learn has a clear advantage in training speed and stability, especially when working with large datasets.

***Task 6***

In [14]:
def extract_6features(tweet, freqs, process_tweet=process_tweet):
    x = np.zeros((1, 7))
    x[0,0] = 1

    pronouns = {"i", "me", "my", "mine", "myself",
            "we", "us", "our", "ours", "ourselves",
            "you", "your", "yours", "yourself", "yourselves"}

    word_1 = process_tweet(tweet)
    
    # x1, x2
    for word in word_1:
        if (word, 1) in freqs.keys():
            x[0,1] += freqs[(word, 1)]
        
        if (word, 0) in freqs.keys():
            x[0,2] += freqs[(word, 0)]
    
    # x3
    if re.search(r"\bno\b", tweet.lower()):
        x[0,3] = 1
    
    # x4
    tokens_raw = re.findall(r"\w+", tweet.lower())
    x[0,4] = sum(1 for t in tokens_raw if t in pronouns)
    
    # x5
    if "!" in tweet:
        x[0,5] = 1
    
    # x6
    word_count = len(tokens_raw)
    if word_count > 0:
        x[0,6] = math.log(word_count)
    
    assert(x.shape == (1, 7))
    return x

In [15]:
X_6features = np.zeros((len(train_x), 7))
for i in range(len(train_x)):
    X_6features[i, :]= extract_6features(train_x[i], freqs)
Y_6features = np.squeeze(train_y)

X_test_6features = np.zeros((len(test_x), 7))
for i in range(len(test_x)):
    X_test_6features[i, :] = extract_6features(test_x[i], freqs)
Y_test_6features = np.squeeze(test_y)

In [16]:
start_time = time.time()
clf = LogisticRegression()
clf.fit(X_6features, Y_6features)
y_pred = clf.predict(X_test_6features)
time = time.time() - start_time
accuracy = accuracy_score(Y_test_6features, y_pred)
precision = precision_score(Y_test_6features, y_pred)
recall = recall_score(Y_test_6features, y_pred)
f1 = f1_score(Y_test_6features, y_pred)
roc_auc = roc_auc_score(Y_test_6features, y_pred)

In [17]:
print(f"Accuracy = {accuracy:.4f}, "
      f"Precision = {precision:.4f}, "
      f"Recall = {recall:.4f}, "
      f"F1 = {f1:.4f}, "
      f"ROC-AUC = {roc_auc:.4f}, "
      f"Time = {time:.4f} seconds")

Accuracy = 0.9940, Precision = 0.9911, Recall = 0.9970, F1 = 0.9940, ROC-AUC = 0.9940, Time = 0.0628 seconds


- With two features, the model achieved:
Accuracy = 0.9950, Precision = 0.9920, Recall = 0.9980, F1 = 0.9950, and ROC-AUC = 0.9960. 
- When four additional features were introduced, the performance slightly dropped to: 
Accuracy = 0.9940, Precision = 0.9911, Recall = 0.9970, F1 = 0.9940, and ROC-AUC = 0.9940. 
- This decrease occurred because the extra features added complexity but contributed little useful information, introducing noise. Moreover, scikit-learn’s default L2 regularization penalized unnecessary weights, which reduced overfitting but also led to slightly lower precision and recall.