In [139]:
import numpy as np
import nltk
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score
from collections import defaultdict
import time
import re
import string

nltk.download('twitter_samples')
nltk.download('stopwords')


[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/klinhfhm/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/klinhfhm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [140]:
import warnings
from sklearn.exceptions import DataConversionWarning

warnings.filterwarnings(action='ignore', category=DataConversionWarning)


In [141]:
def process_tweet(tweet):
    stemmer = nltk.PorterStemmer()
    stopwords_english = stopwords.words('english') 
    
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    
    tokenizer = nltk.TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    
    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and 
            word not in string.punctuation):
            stem_word = stemmer.stem(word)
            tweets_clean.append(stem_word)
    return tweets_clean


In [142]:
def build_freqs(tweets, ys):
    freqs = defaultdict(int)
    for y, tweet in zip(ys, tweets):
        for word in process_tweet(tweet):
            freqs[(word, y[0])] += 1
    return freqs

all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

train_x = all_positive_tweets[:4000] + all_negative_tweets[:4000]
train_y = np.append(np.ones((4000,1)), np.zeros((4000,1)), axis=0)

freqs = build_freqs(train_x, train_y)


In [143]:

def extract_features(tweet, freqs):
    word_l = process_tweet(tweet)
    x = np.zeros((1, 3)) 
    x[0,0] = 1 
    for word in word_l:
        x[0,1] += freqs.get((word,1.0), 0)
        x[0,2] += freqs.get((word,0.0), 0)
    return x


In [144]:
def sigmoid(z): 
    return 1 / (1 + np.exp(-z))


In [145]:
def gradient_descent_logistic(X, y, w, alpha, num_iters=100):
    m = X.shape[0]
    for i in range(num_iters):
        z = np.dot(X, w)
        h = sigmoid(z)
        J = -1/m * (np.dot(y.T, np.log(h)) + np.dot((1-y).T, np.log(1-h)))
        w -= alpha/m * np.dot(X.T, (h-y))
    return J, w

In [146]:
def predict_logistic(X, w):
    z = np.dot(X, w)
    h = sigmoid(z)
    return (h >= 0.5).astype(int)

In [147]:
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i, :] = extract_features(train_x[i], freqs)
Y = train_y

**TASK 1**

In [148]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

scalers = {
    "No Scaler": None,
    "MinMax": MinMaxScaler(),
    "Standard": StandardScaler(),
    "Robust": RobustScaler()
}

for name, scaler in scalers.items():
    if scaler:
        X_scaled = scaler.fit_transform(X)
    else:
        X_scaled = X
    
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_scaled, Y.ravel())
    y_pred = clf.predict(X_scaled)
    precision = precision_score(Y, y_pred)
    print(f"{name}: Precision = {precision:.4f}")



No Scaler: Precision = 0.9845
MinMax: Precision = 0.8972
Standard: Precision = 0.9796
Robust: Precision = 0.9773


In [149]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

# Giả sử X, Y đã có sẵn và Y là 1D
# Nếu Y đang ở dạng (n,1): ép về 1D
Y = Y.ravel()

# Loại bỏ cột bias (nếu có). Giả sử bias là cột 0:
X_feats = X[:, 1:]   # giữ chỉ các đặc trưng thực (pos_count, neg_count, ...)

# Tạo train/test split (stratify để giữ cân bằng nhãn)
X_train, X_test, y_train, y_test = train_test_split(
    X_feats, Y, test_size=0.2, random_state=42, stratify=Y
)

scalers = {
    "No Scaler": None,
    "MinMax": MinMaxScaler(),
    "Standard": StandardScaler(),
    "Robust": RobustScaler()
}
results = {}
for name, scaler in scalers.items():
    if scaler is None:
        X_train_s = X_train.copy()
        X_test_s  = X_test.copy()
    else:
        # Fit only on train to avoid leakage
        scaler.fit(X_train)
        X_train_s = scaler.transform(X_train)
        X_test_s  = scaler.transform(X_test)

    # Train model (Logistic Regression)
    clf = LogisticRegression(max_iter=1000, random_state=42)
    clf.fit(X_train_s, y_train)
    y_pred = clf.predict(X_test_s)
    precision = precision_score(y_test, y_pred)
    results[name] = precision
    print(f"{name}: Precision = {precision:.4f}")

No Scaler: Precision = 0.9926
MinMax: Precision = 0.9019
Standard: Precision = 0.9864
Robust: Precision = 0.9816


**No Scaler:** Precision = 0.9845
→ This is the highest score, indicating that raw word frequency features are already on a stable and comparable scale. Logistic Regression can directly exploit them without normalization.

**Min-Max Scaler:** Precision = 0.8972
→ Precision drops significantly. By compressing all feature values into the [0,1] range, MinMax scaling reduces the contrast between tweets with many positive/negative words and those with only a few. As a result, the decision boundary becomes less discriminative.

**Standard Scaler:** Precision = 0.9796
→ Performance is close to the unscaled baseline. Even though word counts are not normally distributed, standardization preserves the relative proportions between positive and negative frequencies.

**Robust Scaler:** Precision = 0.9773
→ Also close to the baseline. Since the dataset does not contain extreme outliers in word counts, the median/IQR scaling does not bring additional benefits.


**TASK 7**

In [None]:
#cách sử dụng cả các metrics khác để đối chiếu

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pandas as pd

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Multinomial NB": MultinomialNB(),
    "Bernoulli NB": BernoulliNB(),
    "Linear SVM": LinearSVC(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "MLP Neural Net": MLPClassifier(hidden_layer_sizes=(32,), max_iter=500, random_state=42)
}

results = []

for name, model in models.items():
    if name in ["Logistic Regression", "Linear SVM", "KNN", "MLP Neural Net"]:
        clf = Pipeline([
            ("scaler", StandardScaler(with_mean=False)), 
            ("model", model)
        ])
    else:
        clf = model

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average="binary")
    rec = recall_score(y_test, y_pred, average="binary")
    f1 = f1_score(y_test, y_pred, average="binary")

    results.append([name, acc, prec, rec, f1])

df_results = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1"])
df_results = df_results.sort_values(by="Precision", ascending=False).reset_index(drop=True)

print(df_results)
print("\nBest model:", df_results.iloc[0]["Model"], "→ Precision =", df_results.iloc[0]["Precision"])


                 Model  Accuracy  Precision    Recall        F1
0         Bernoulli NB  0.508750   1.000000  0.022388  0.043796
1       Multinomial NB  0.993125   0.990111  0.996269  0.993180
2        Random Forest  0.993125   0.990111  0.996269  0.993180
3                  KNN  0.992500   0.988889  0.996269  0.992565
4       MLP Neural Net  0.987500   0.976886  0.998756  0.987700
5           Linear SVM  0.985000   0.972155  0.998756  0.985276
6  Logistic Regression  0.983125   0.968637  0.998756  0.983466

Best model: Bernoulli NB → Precision = 1.0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
import pandas as pd

# Split fixed train/test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y.ravel(), test_size=0.2, random_state=42, stratify=Y)

# Define models with basic configuration
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Multinomial NB": MultinomialNB(),
    "Bernoulli NB": BernoulliNB(),
    "Linear SVM": LinearSVC(max_iter=2000),
    "Random Forest (100)": RandomForestClassifier(n_estimators=100, random_state=42),
    "Random Forest (200)": RandomForestClassifier(n_estimators=200, random_state=42),
    "KNN (5)": KNeighborsClassifier(n_neighbors=5),
    "KNN (10)": KNeighborsClassifier(n_neighbors=10),
    "MLP Neural Net (50)": MLPClassifier(hidden_layer_sizes=(50,), max_iter=500, random_state=42),
    "MLP Neural Net (100)": MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
}

# Train and calculate precision
results = {}
for name, model in models.items():
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)
    precision = precision_score(Y_test, y_pred)
    results[name] = precision

df_results = pd.DataFrame(results.items(), columns=["Model", "Precision"])
df_results = df_results.sort_values(by="Precision", ascending=False).reset_index(drop=True)

print(df_results)

best_model = df_results.iloc[0]
print(f"\nBest model: {best_model['Model']} với Precision = {best_model['Precision']:.4f}")


                  Model  Precision
0          Bernoulli NB   1.000000
1        Multinomial NB   0.997503
2            Linear SVM   0.997503
3   MLP Neural Net (50)   0.997503
4               KNN (5)   0.997500
5              KNN (10)   0.997500
6   Random Forest (100)   0.997497
7   Random Forest (200)   0.997497
8  MLP Neural Net (100)   0.996259
9   Logistic Regression   0.992556

Best model: Bernoulli NB với Precision = 1.0000
