In [64]:
#Part A
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
from nltk.util import ngrams
import re

# Load data
data = pd.read_csv("/Users/inkarbaidaly/Downloads/train-2.csv")
data

Unnamed: 0,review,sentiment,id
0,I argued with myself whether to rent this or n...,negative,41449
1,This was one of the dullest movies I have seen...,negative,18376
2,"I didn't know what to expect from 'Ned Kelly',...",positive,31081
3,All the funny things happening in this sitcom ...,negative,5696
4,We all know a movie never does complete justic...,negative,5714
...,...,...,...
39995,"Some good movies keep you in front of the TV, ...",negative,13645
39996,I recently watched Caprica again and thought I...,positive,45044
39997,A stupid teen supposed comedy that revolves a ...,negative,19453
39998,When Carol (Vanessa Hidalgo) starts looking in...,negative,13181


In [65]:
import nltk
print(nltk.data.path)

['/Users/inkarbaidaly/nltk_data', '/Library/Frameworks/Python.framework/Versions/3.10/nltk_data', '/Library/Frameworks/Python.framework/Versions/3.10/share/nltk_data', '/Library/Frameworks/Python.framework/Versions/3.10/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']


In [66]:
# Create two datasets based on the "sentiment" column's value
normal_tweets = data[data["sentiment"] == "positive"]
disaster_tweets = data[data["sentiment"] == "negative"]
# Optional: Print the first few rows of each dataset
print("\nFirst few rows of positive_dataset:")
normal_tweets


First few rows of positive_dataset:


Unnamed: 0,review,sentiment,id
2,"I didn't know what to expect from 'Ned Kelly',...",positive,31081
5,i read the book before i saw the movie i knew ...,positive,46036
6,This has to be the ultimate chick flick ever. ...,positive,23583
7,"A light-hearted comedy, Nothing shows us a wor...",positive,32635
8,"Hadnt heard a lot about this movie, except it ...",positive,6022
...,...,...,...
39991,In what appears an attempt to mix drama and co...,positive,31525
39993,This is an excellent film!Tom Hanks and Paul N...,positive,47203
39994,I too was quite astonished to see how few peop...,positive,5522
39996,I recently watched Caprica again and thought I...,positive,45044


In [67]:
print("\nFirst few rows of negative_dataset:")
disaster_tweets


First few rows of negative_dataset:


Unnamed: 0,review,sentiment,id
0,I argued with myself whether to rent this or n...,negative,41449
1,This was one of the dullest movies I have seen...,negative,18376
3,All the funny things happening in this sitcom ...,negative,5696
4,We all know a movie never does complete justic...,negative,5714
9,"The film, a Universal release of a Protelco-ML...",negative,18363
...,...,...,...
39989,I can say without a shadow of a doubt that Goi...,negative,27611
39992,"Sarah Silverman is really the ""flavor of the m...",negative,34468
39995,"Some good movies keep you in front of the TV, ...",negative,13645
39997,A stupid teen supposed comedy that revolves a ...,negative,19453


In [79]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = re.sub(r'http\S+', '', text)  
    text = re.sub(r'@[^\s]+', '', text)  
    text = re.sub(r'[^\w\s]', '', text)   
    tokens = word_tokenize(text.lower()) 
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]  
    return ' '.join(tokens)

disaster_tweets.loc[:, 'review'] = disaster_tweets['review'].apply(preprocess_text)
normal_tweets.loc[:, 'review'] = normal_tweets['review'].apply(preprocess_text)

print("Disaster Dataset after Lemmatization:")
disaster_tweets['review'].head()

Disaster Dataset after Lemmatization:


0    argued whether rent im always afraid renting s...
1    one dullest movie seen time im late 40 watched...
3    funny thing happening sitcom based main charac...
4    know movie never complete justice book excepti...
9    film universal release protelcomlc production ...
Name: review, dtype: object

In [80]:
print("\nNormal Dataset after Lemmatization:")
normal_tweets['review'].head()


Normal Dataset after Lemmatization:


2    didnt know expect ned kelly absolutely loved d...
5    read book saw movie knew movie going good book...
6    ultimate chick flick ever taped tv year ago iv...
7    lighthearted comedy nothing show u world somet...
8    hadnt heard lot movie except national award os...
Name: review, dtype: object

In [83]:
from collections import Counter

disaster_word_counts = Counter()
normal_word_counts = Counter()

for index, row in disaster_tweets.iterrows():
    tokens = row['review'].split() 
    disaster_word_counts.update(tokens) 

for index, row in normal_tweets.iterrows():
    tokens = row['review'].split()  
    normal_word_counts.update(tokens)  

print("Word Counts in Disaster Tweets:")
print(disaster_word_counts)

print("\nWord Counts in Normal Tweets:")
print(normal_word_counts)

Word Counts in Disaster Tweets:

Word Counts in Normal Tweets:


In [86]:
top_disaster_words = disaster_word_counts.most_common(20)
top_normal_words = normal_word_counts.most_common(20)

print("Top 20 words in disaster tweets:")
print(top_disaster_words)
print("\nTop 20 words in normal tweets:")
print(top_normal_words)

Top 20 words in disaster tweets:
[('br', 46636), ('movie', 44780), ('film', 33760), ('one', 20826), ('like', 17865), ('even', 12132), ('good', 11369), ('bad', 11328), ('time', 11317), ('character', 11141), ('would', 10886), ('get', 10574), ('make', 10100), ('really', 9752), ('see', 8847), ('scene', 8833), ('story', 8423), ('dont', 8196), ('much', 7974), ('people', 7482)]

Top 20 words in normal tweets:
[('br', 43913), ('film', 38171), ('movie', 34550), ('one', 21904), ('like', 14116), ('time', 12166), ('good', 11550), ('character', 10960), ('story', 10858), ('great', 10151), ('see', 10116), ('well', 9035), ('get', 8836), ('make', 8738), ('really', 8573), ('also', 8539), ('would', 8279), ('scene', 7731), ('even', 7466), ('show', 7435)]


In [88]:
def preprocess_and_extract_ngrams(text, n):
    text = re.sub(r'http\S+', '', text) 
    text = re.sub(r'@[^\s]+', '', text) 
    text = re.sub(r'[^\w\s]', '', text)   
    tokens = word_tokenize(text.lower())  
    tokens = [word for word in tokens if word not in stop_words] 
    n_grams = list(ngrams(tokens, n)) 
    return n_grams

disaster_bigrams = disaster_tweets['review'].apply(lambda x: preprocess_and_extract_ngrams(x, 2))
disaster_trigrams = disaster_tweets['review'].apply(lambda x: preprocess_and_extract_ngrams(x, 3))

normal_bigrams = normal_tweets['review'].apply(lambda x: preprocess_and_extract_ngrams(x, 2))
normal_trigrams = normal_tweets['review'].apply(lambda x: preprocess_and_extract_ngrams(x, 3))

disaster_flat_bigrams = [bigram for sublist in disaster_bigrams for bigram in sublist]
disaster_flat_trigrams = [trigram for sublist in disaster_trigrams for trigram in sublist]

normal_flat_bigrams = [bigram for sublist in normal_bigrams for bigram in sublist]
normal_flat_trigrams = [trigram for sublist in normal_trigrams for trigram in sublist]


disaster_bigram_counts = Counter(disaster_flat_bigrams)
disaster_trigram_counts = Counter(disaster_flat_trigrams)

normal_bigram_counts = Counter(normal_flat_bigrams)
normal_trigram_counts = Counter(normal_flat_trigrams)


top_disaster_bigrams = disaster_bigram_counts.most_common(20)
top_disaster_trigrams = disaster_trigram_counts.most_common(20)

top_normal_bigrams = normal_bigram_counts.most_common(20)
top_normal_trigrams = normal_trigram_counts.most_common(20)

print("Top 20 bigrams in disaster tweets:")
print(top_disaster_bigrams)
print("\nTop 20 trigrams in disaster tweets:")
print(top_disaster_trigrams)

print("\nTop 20 bigrams in normal tweets:")
print(top_normal_bigrams)
print("\nTop 20 trigrams in normal tweets:")
print(top_normal_trigrams)

Top 20 bigrams in disaster tweets:
[(('br', 'br'), 5207), (('look', 'like'), 1749), (('ever', 'seen'), 1314), (('special', 'effect'), 1159), (('br', 'movie'), 1127), (('waste', 'time'), 1082), (('dont', 'know'), 936), (('bad', 'movie'), 906), (('br', 'film'), 890), (('worst', 'movie'), 880), (('itbr', 'br'), 874), (('main', 'character'), 869), (('ive', 'seen'), 868), (('movie', 'like'), 824), (('movie', 'ever'), 817), (('moviebr', 'br'), 813), (('horror', 'movie'), 790), (('much', 'better'), 766), (('one', 'worst'), 750), (('even', 'though'), 720)]

Top 20 trigrams in disaster tweets:
[(('ive', 'ever', 'seen'), 514), (('worst', 'movie', 'ever'), 440), (('movie', 'ever', 'seen'), 293), (('dont', 'waste', 'time'), 274), (('one', 'worst', 'movie'), 255), (('movie', 'ive', 'ever'), 229), (('worst', 'film', 'ever'), 218), (('movie', 'ever', 'made'), 189), (('worst', 'movie', 'ive'), 186), (('film', 'ever', 'seen'), 160), (('waste', 'time', 'money'), 146), (('one', 'worst', 'film'), 141), ((

In [None]:
###Part B

In [91]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@[^\s]+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(tokens)

disaster_tweets.loc[:, 'review'] = disaster_tweets['review'].apply(preprocess_text)
normal_tweets.loc[:, 'review'] = normal_tweets['review'].apply(preprocess_text)

print("Disaster Dataset after Preprocessing:")
print(disaster_tweets['review'].head())

print("\nNormal Dataset after Preprocessing:")
print(normal_tweets['review'].head())

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


Disaster Dataset after Preprocessing:
0    argued whether rent im always afraid renting s...
1    one dullest movie seen time im late 40 watched...
3    funny thing happening sitcom based main charac...
4    know movie never complete justice book excepti...
9    film universal release protelcomlc production ...
Name: review, dtype: object

Normal Dataset after Preprocessing:
2    didnt know expect ned kelly absolutely loved d...
5    read book saw movie knew movie going good book...
6    ultimate chick flick ever taped tv year ago iv...
7    lighthearted comedy nothing show u world somet...
8    hadnt heard lot movie except national award os...
Name: review, dtype: object


In [None]:
#Part C

In [94]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score


X_train, X_test, y_train, y_test = train_test_split(data['review'], data['sentiment'], test_size=0.2, random_state=42)

max_features_values = [100, 1000]

for max_features in max_features_values:
    vectorizer = CountVectorizer(max_features=max_features)
    
    X_train_vectorized = vectorizer.fit_transform(X_train)
    
    X_test_vectorized = vectorizer.transform(X_test)
    
    model = LogisticRegression(max_iter=1000)
    
    model.fit(X_train_vectorized, y_train)
    
    y_pred = model.predict(X_test_vectorized)
    
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    
    print(f"\nPerformance with max_features={max_features}:")
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    print(f"Recall: {recall}")


Performance with max_features=100:
Accuracy: 0.733625
F1 Score: 0.7335920477969484
Recall: 0.733625

Performance with max_features=1000:
Accuracy: 0.865
F1 Score: 0.8650131632404323
Recall: 0.865
