In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score
from collections import defaultdict
import time
import re
import string

nltk.download('twitter_samples')
nltk.download('stopwords')


[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/klinhfhm/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/klinhfhm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [418]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

train_x = train_pos + train_neg 
test_x = test_pos + test_neg

train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [419]:
import warnings
from sklearn.exceptions import DataConversionWarning

warnings.filterwarnings(action='ignore', category=DataConversionWarning)


In [420]:
def process_tweet(tweet):
    stemmer = nltk.PorterStemmer()
    stopwords_english = stopwords.words('english') 
    
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    
    tokenizer = nltk.TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    
    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and 
            word not in string.punctuation):
            stem_word = stemmer.stem(word)
            tweets_clean.append(stem_word)
    return tweets_clean


In [421]:
def build_freqs(tweets, ys):
    yslist = np.squeeze(ys).tolist()
    # start with an empty dict and populate it by looping over all tweets
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1

    return freqs

freqs = build_freqs(train_x, train_y)

In [422]:
	
def extract_features(tweet, freqs, process_tweet=process_tweet):
    # process_tweet tokenizes, stems, and removes stopwords
    word_l = process_tweet(tweet)
    
    # 3 elements in the form of a 1 x 3 vector
    x = np.zeros((1, 3)) 
    
    #bias term is set to 1
    x[0,0] = 1    
    # loop through each word in the list of words
    for word in word_l:
        
        # increment the word count for the positive label 1
        if (word, 1) in freqs.keys():
            x[0,1] += freqs[(word, 1)]
        
        # increment the word count for the negative label 0
        if (word, 0) in freqs.keys():
            x[0,2] += freqs[(word, 0)]
        
    assert(x.shape == (1, 3))
    return x

In [423]:
def sigmoid(z): 
    return 1 / (1 + np.exp(-z))

In [424]:
def gradient_descent_logistic(x, y, theta, alpha, num_iters):
    m = len(x)
    losses = []
    for i in range(num_iters):
        z = np.dot(x, theta)
        h = sigmoid(z)
        
        # avoid log(0) → clip h
        h = np.clip(h, 1e-15, 1 - 1e-15)
        
        J = - (np.dot(y.T, np.log(h)) + np.dot((1-y).T, np.log(1-h))) / m 
        losses.append(J.item())   # store as clean float
        
        theta = theta - (alpha * np.dot(x.T, (h-y))) / m
    return losses, theta

In [425]:
def predict_tweet(tweet, freqs, theta):
    # extract the features of the tweet and store it into x
    x = extract_features(tweet, freqs)
    # make the prediction using x and theta
    y_pred = sigmoid(np.dot(x, theta))
    
    return y_pred
def predict_logistic(X, w):
    z = np.dot(X, w)
    h = sigmoid(z)
    return (h >= 0.5).astype(int)

In [426]:
# collect the features 'x' and stack them into a matrix 'X'
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i, :]= extract_features(train_x[i], freqs)

# training labels corresponding to X
Y = train_y

X_test = np.zeros((len(test_x), 3))
for i in range(len(test_x)):
    X_test[i, :] = extract_features(test_x[i], freqs)
Y_test = np.squeeze(test_y)

**TASK 4**

In [427]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score, precision_score

scalers = {
    "Baseline (No Scaler)": None,
    "MinMax": MinMaxScaler(),
    "Standard": StandardScaler(),
    "Robust": RobustScaler()
}

for name, scaler in scalers.items():
    print(f"\n{name}")
    
    if scaler:
        X_train_s = np.hstack([np.ones((X.shape[0], 1)), scaler.fit_transform(X[:, 1:])])
        X_test_s  = np.hstack([np.ones((X_test.shape[0], 1)), scaler.transform(X_test[:, 1:])])
    else:
        X_train_s, X_test_s = X, X_test
    
    w_init = np.zeros((X_train_s.shape[1], 1))
    losses, w = gradient_descent_logistic(X_train_s, Y, w_init, alpha=1e-9, num_iters=20000)
    
    y_pred = predict_logistic(X_test_s, w)
    print(f"Final Loss: {losses[-1]:.4f}")
    print(f"Accuracy: {accuracy_score(Y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(Y_test, y_pred):.4f}")



Baseline (No Scaler)
Final Loss: 0.0964
Accuracy: 0.9960
Precision: 0.9930

MinMax
Final Loss: 0.6931
Accuracy: 0.9825
Precision: 0.9979

Standard
Final Loss: 0.6931
Accuracy: 0.9695
Precision: 0.9425

Robust
Final Loss: 0.6931
Accuracy: 0.9935
Precision: 0.9980


**No Scaler:** Accuracy = 0.9960 → This is the highest score, showing that raw word frequency features are already well scaled for logistic regression. The model can separate positive vs. negative tweets effectively without normalization.

**Min-Max Scaler:** Accuracy = 0.9825 → Accuracy drops compared to the baseline. By compressing all feature values into the [0,1] range, MinMax scaling reduces the separation between tweets with many emotional words and those with fewer. This weakens the discriminative power of the decision boundary.

**Standard Scaler:** Accuracy = 0.9695 → Performance decreases further. Since word count features are not normally distributed, standardization distorts their distribution and reduces model performance.

**Robust Scaler:** Accuracy = 0.9935 → Very close to the baseline. Because the dataset does not contain significant outliers in word frequencies, median/IQR-based scaling offers little extra benefit but also does not harm performance.

**TASK 7**

**Test with 2 features**

In [429]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": LinearSVC(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Naive Bayes": BernoulliNB()
}

results = []

for name, model in models.items():
    # scale features for models sensitive to magnitude
    if name in ["Logistic Regression", "SVM", "KNN", "MLP Neural Net"]:
        clf = Pipeline([
            ("scaler", StandardScaler(with_mean=False)),
            ("model", model)
        ])
    else:
        clf = model

    clf.fit(X, Y)
    y_pred = clf.predict(X_test)

    acc = accuracy_score(Y_test, y_pred)
    prec = precision_score(Y_test, y_pred, zero_division=0)

    results.append([name, acc, prec])

df_results = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision"])
df_results = df_results.sort_values(by="Precision", ascending=False).reset_index(drop=True)

print(df_results)
print("\nBest model:", df_results.iloc[0]["Model"], "→ Precision =", df_results.iloc[0]["Precision"])


                 Model  Accuracy  Precision
0        Random Forest    0.9925   0.992993
1        Decision Tree    0.9895   0.990973
2                  SVM    0.9930   0.988119
3  Logistic Regression    0.9930   0.987154
4          Naive Bayes    0.5050   0.857143

Best model: Random Forest → Precision = 0.992992992992993


**Test with 6 features**

In [430]:
from sklearn.metrics import recall_score, f1_score, roc_auc_score
import math
import time

def extract_6features(tweet, freqs, process_tweet=process_tweet):
    x = np.zeros((1, 7))
    x[0,0] = 1

    pronouns = {"i", "me", "my", "mine", "myself",
            "we", "us", "our", "ours", "ourselves",
            "you", "your", "yours", "yourself", "yourselves"}

    word_1 = process_tweet(tweet)
    
    # x1, x2
    for word in word_1:
        if (word, 1) in freqs.keys():
            x[0,1] += freqs[(word, 1)]
        
        if (word, 0) in freqs.keys():
            x[0,2] += freqs[(word, 0)]
    
    # x3
    if re.search(r"\bno\b", tweet.lower()):
        x[0,3] = 1
    
    # x4
    tokens_raw = re.findall(r"\w+", tweet.lower())
    x[0,4] = sum(1 for t in tokens_raw if t in pronouns)
    
    # x5
    if "!" in tweet:
        x[0,5] = 1
    
    # x6
    word_count = len(tokens_raw)
    if word_count > 0:
        x[0,6] = math.log(word_count)
    
    assert(x.shape == (1, 7))
    return x
X_6features = np.zeros((len(train_x), 7))

for i in range(len(train_x)):
    X_6features[i, :]= extract_6features(train_x[i], freqs)
Y_6features = np.squeeze(train_y)
X_test_6features = np.zeros((len(test_x), 7))

for i in range(len(test_x)):
    X_test_6features[i, :] = extract_6features(test_x[i], freqs)
Y_test_6features = np.squeeze(test_y)
start_time = time.time()
clf = LogisticRegression()
clf.fit(X_6features, Y_6features)
y_pred = clf.predict(X_test_6features)
time = time.time() - start_time
accuracy = accuracy_score(Y_test_6features, y_pred)
precision = precision_score(Y_test_6features, y_pred)
recall = recall_score(Y_test_6features, y_pred)
f1 = f1_score(Y_test_6features, y_pred)
roc_auc = roc_auc_score(Y_test_6features, y_pred)

print(f"Accuracy = {accuracy:.4f}, "
      f"Precision = {precision:.4f}, "
      f"Recall = {recall:.4f}, "
      f"F1 = {f1:.4f}, "
      f"ROC-AUC = {roc_auc:.4f}, "
      f"Time = {time:.4f} seconds")

Accuracy = 0.9940, Precision = 0.9901, Recall = 0.9980, F1 = 0.9940, ROC-AUC = 0.9940, Time = 0.0099 seconds


In [432]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import time
import numpy as np

# List of models to test
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": BernoulliNB(),
    "SVM": SVC(kernel='linear', probability=True, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

results = []

for name, model in models.items():
    start_time = time.time()
    
    # Scale only for models sensitive to magnitude
    if name in ["Logistic Regression", "SVM"]:
        clf = Pipeline([
            ("scaler", StandardScaler(with_mean=False)),
            ("model", model)
        ])
    else:
        clf = model

    clf.fit(X_6features, Y_6features)
    y_pred = clf.predict(X_test_6features)
    
    elapsed_time = time.time() - start_time
    accuracy = accuracy_score(Y_test_6features, y_pred)
    precision = precision_score(Y_test_6features, y_pred, zero_division=0)
    
    results.append([name, accuracy, precision])

# Convert to DataFrame
import pandas as pd
df_results = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision"])
df_results = df_results.sort_values(by="Precision", ascending=False).reset_index(drop=True)

print(df_results)
print("\nBest model:", df_results.iloc[0]["Model"], "→ Precision =", df_results.iloc[0]["Precision"])


                 Model  Accuracy  Precision
0        Decision Tree    0.9910   0.992972
1        Random Forest    0.9920   0.990040
2  Logistic Regression    0.9940   0.988142
3                  SVM    0.9930   0.987154
4          Naive Bayes    0.5765   0.723032

Best model: Decision Tree → Precision = 0.9929718875502008
