# In [None]:
import nltk
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score

# Fetch the NLTK resources used below; quiet=True keeps the download log silent.
for resource in ('wordnet', 'stopwords'):
    nltk.download(resource, quiet=True)

df = pd.read_csv('/Users/bbnv/Desktop/NLP/assignment4/sdu-inf376-2024-imdb-lr/train.csv')

# Partition the rows by sentiment label. Each subset is copied so it can be
# modified later without touching `df`.
# NOTE(review): despite their names, `disasterTweets` holds the rows labelled
# 'positive' and `normalTweets` the rows labelled 'negative'.
disasterTweets = df.loc[df['sentiment'] == 'positive'].copy()
normalTweets = df.loc[df['sentiment'] == 'negative'].copy()

# Shared text-normalization helpers: a lemmatizer instance and the English
# stop-word list turned into a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
english_stop_words = nltk.corpus.stopwords.words('english')
stop_words = set(english_stop_words)

def clean_and_lemmatize_text(text):
    """Normalize one raw text string for bag-of-words processing.

    Steps, in order: lowercase the text, remove ``@mention`` tokens, delete
    every character that is neither a word character nor whitespace, drop
    tokens found in the module-level ``stop_words`` set, and lemmatize the
    remaining tokens with the module-level ``lemmatizer``.

    Args:
        text: Raw text to clean (a ``str``).

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    # The stop-word list is lowercase, so fold case before filtering;
    # otherwise capitalized stop words such as "The" or "And" slip through.
    text = text.lower()
    text = re.sub(r'@[^\s]+', '', text)  # strip @mentions
    text = re.sub(r'[^\w\s]', '', text)  # strip punctuation / symbols
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

# Replace the raw 'review' text of each per-sentiment frame with its cleaned,
# lemmatized form. Both frames get exactly the same treatment.
for frame in (disasterTweets, normalTweets):
    frame.loc[:, 'review'] = frame['review'].apply(clean_and_lemmatize_text)

# Token frequencies per sentiment class.
# NOTE: despite the variable names, `disasterTweets` holds the reviews labelled
# 'positive' and `normalTweets` the reviews labelled 'negative'; the printed
# headings below describe what the data actually is.
disaster_word_counts = Counter(
    token for text in disasterTweets['review'] for token in text.split()
)
normal_word_counts = Counter(
    token for text in normalTweets['review'] for token in text.split()
)

# Find top 20 words by occurrence
top_disaster_words = disaster_word_counts.most_common(20)
top_normal_words = normal_word_counts.most_common(20)

print("Top 20 words in positive reviews:")
for word, count in top_disaster_words:
    print(f"{word}: {count}")

print("\nTop 20 words in negative reviews:")
for word, count in top_normal_words:
    print(f"{word}: {count}")

# Stack both per-sentiment frames back into one dataset, then pull out the
# text column as features and the sentiment column as labels.
combined_df = pd.concat([disasterTweets, normalTweets], axis=0)
X, y = combined_df['review'], combined_df['sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Vocabulary sizes to compare for the bag-of-words representation.
max_features_list = [100, 1000]

for max_features in max_features_list:
    # Learn the vocabulary from the training split only, then reuse it to
    # encode the held-out split.
    vectorizer = CountVectorizer(max_features=max_features)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # fit() returns the estimator itself, so construction and training chain.
    model = LogisticRegression().fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)

    # F1 and recall are reported for the 'positive' class.
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, pos_label='positive')
    recall = recall_score(y_test, y_pred, pos_label='positive')

    print(
        f"\nResults for max_features = {max_features}:",
        f"Accuracy: {accuracy:.4f}",
        f"F1-score: {f1:.4f}",
        f"Recall: {recall:.4f}\n",
        sep="\n",
    )