In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.util import ngrams
import re


In [2]:
nltk.download('punkt')  
nltk.download('stopwords')  

[nltk_data] Downloading package punkt to /home/xan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/xan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# TASK A
data = pd.read_csv('train.csv')
disasterTweets = data[data['sentiment'] == 'positive']
normalTweets = data[data['sentiment'] == 'negative']


In [5]:
def lemmatize_text(text):
    # nltk.download('wordnet')  
    lemmatizer = nltk.WordNetLemmatizer()
    tokens = word_tokenize(text.lower())  
    return [lemmatizer.lemmatize(token) for token in tokens]

disaster_lemmatized = disasterTweets['review'].apply(lemmatize_text)
normal_lemmatized = normalTweets['review'].apply(lemmatize_text)


In [6]:
disaster_word_counts = Counter()
for doc in disaster_lemmatized:
    disaster_word_counts.update(doc)

normal_word_counts = Counter()
for doc in normal_lemmatized:
    normal_word_counts.update(doc)


In [7]:
top_disaster_words = disaster_word_counts.most_common(20)
top_normal_words = normal_word_counts.most_common(20)

print("Top 20 Words (Disaster):")
print(top_disaster_words)

print("Top 20 Words (Normal):")
print(top_normal_words)


Top 20 Words (Disaster):
[('the', 270822), (',', 224874), ('.', 185001), ('a', 171211), ('and', 140884), ('of', 121152), ('to', 104607), ('is', 91226), ('it', 82089), ('in', 78582), ('/', 77967), ('>', 77849), ('<', 77783), ('br', 77754), ('i', 64626), ('that', 55412), ('this', 55297), ("'s", 50786), ('film', 38435), ('with', 36492)]
Top 20 Words (Normal):
[('the', 261406), (',', 209886), ('.', 187369), ('a', 159126), ('and', 117985), ('of', 110122), ('to', 109426), ('/', 83091), ('>', 82985), ('<', 82938), ('br', 82900), ('is', 82287), ('it', 81345), ('i', 74405), ('in', 69840), ('this', 65008), ('that', 59420), ("'s", 47238), ('movie', 45174), ('wa', 44364)]


In [8]:
def find_ngrams(text, n):
    return ngrams(text, n)

disaster_bigrams = Counter()
disaster_trigrams = Counter()

for doc in disaster_lemmatized:
    disaster_bigrams.update(find_ngrams(doc, 2))
    disaster_trigrams.update(find_ngrams(doc, 3))

normal_bigrams = Counter()
normal_trigrams = Counter()

for doc in normal_lemmatized:
    normal_bigrams.update(find_ngrams(doc, 2))
    normal_trigrams.update(find_ngrams(doc, 3))



In [18]:
# TASK B
def remove_special_chars(text):
    pattern = r"[^a-zA-Z0-9\s]"  
    return re.sub(pattern, '', text)

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text)
    return ' '.join([word for word in words if word not in stop_words])

def remove_mentions(text):
    pattern = r"@\w+"  
    return re.sub(pattern, '', text)

def preprocess_text(text):
    text = remove_special_chars(text.lower())  
    text = remove_mentions(text)  
    text = remove_stopwords(text)  
    return text

disasterTweets['processed_text'] = disasterTweets['review'].apply(preprocess_text)
normalTweets['processed_text'] = normalTweets['review'].apply(preprocess_text)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  disasterTweets['processed_text'] = disasterTweets['review'].apply(preprocess_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  normalTweets['processed_text'] = normalTweets['review'].apply(preprocess_text)


In [31]:
# TASK C
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score

processed_text = pd.concat([disasterTweets['processed_text'], normalTweets['processed_text']])
labels = [1] * len(disasterTweets) + [0] * len(normalTweets)

def evaluate_model(max_features):
    vectorizer = CountVectorizer(max_features=max_features) 
    features = vectorizer.fit_transform(processed_text)

    model = LogisticRegression(max_iter=1000)  
    model.fit(features, labels)

    predictions = model.predict(features)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='binary')  
    recall = recall_score(labels, predictions)

    print(f"Max Features: {max_features}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"Recall: {recall:.4f}")
    print("-" * 30)

for max_features in range(100, 1100, 100):
    evaluate_model(max_features)  

Max Features: 100
Accuracy: 0.7343
F1-Score: 0.7373
Recall: 0.7479
------------------------------
Max Features: 200
Accuracy: 0.7799
F1-Score: 0.7836
Recall: 0.7995
------------------------------
Max Features: 300
Accuracy: 0.8160
F1-Score: 0.8184
Recall: 0.8317
------------------------------
Max Features: 400
Accuracy: 0.8369
F1-Score: 0.8389
Recall: 0.8517
------------------------------
Max Features: 500
Accuracy: 0.8443
F1-Score: 0.8459
Recall: 0.8572
------------------------------
Max Features: 600
Accuracy: 0.8507
F1-Score: 0.8521
Recall: 0.8624
------------------------------
Max Features: 700
Accuracy: 0.8588
F1-Score: 0.8601
Recall: 0.8708
------------------------------
Max Features: 800
Accuracy: 0.8643
F1-Score: 0.8655
Recall: 0.8754
------------------------------
Max Features: 900
Accuracy: 0.8697
F1-Score: 0.8709
Recall: 0.8812
------------------------------
Max Features: 1000
Accuracy: 0.8721
F1-Score: 0.8729
Recall: 0.8809
------------------------------
