In [9]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
from nltk.util import ngrams
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score


In [10]:
try:
    df = pd.read_csv('/content/review.csv')
except pd.errors.ParserError as e:
    print("ParserError occurred:", e)


disaster_tweets = df[df['sentiment'] == 'negative']['review'].tolist()
normal_tweets = df[df['sentiment'] == 'positive']['review'].tolist()

nltk.download('punkt')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()


disaster_words = [word_tokenize(tweet.lower()) for tweet in disaster_tweets]
disaster_words_lemmatized = [[lemmatizer.lemmatize(word) for word in words] for words in disaster_words]


normal_words = [word_tokenize(tweet.lower()) for tweet in normal_tweets]
normal_words_lemmatized = [[lemmatizer.lemmatize(word) for word in words] for words in normal_words]

disaster_words_flat = [word for sublist in disaster_words_lemmatized for word in sublist]
normal_words_flat = [word for sublist in normal_words_lemmatized for word in sublist]

disaster_word_freq = FreqDist(disaster_words_flat)
normal_word_freq = FreqDist(normal_words_flat)

top_disaster_words = disaster_word_freq.most_common(20)
top_normal_words = normal_word_freq.most_common(20)

disaster_bigrams = list(ngrams(disaster_words_flat, 2))
disaster_trigrams = list(ngrams(disaster_words_flat, 3))

normal_bigrams = list(ngrams(normal_words_flat, 2))
normal_trigrams = list(ngrams(normal_words_flat, 3))

top_disaster_bigrams = FreqDist(disaster_bigrams).most_common(20)
top_disaster_trigrams = FreqDist(disaster_trigrams).most_common(20)

top_normal_bigrams = FreqDist(normal_bigrams).most_common(20)
top_normal_trigrams = FreqDist(normal_trigrams).most_common(20)


print("Top 20 words in disaster tweets:")
print(top_disaster_words)

print("\nTop 20 words in normal tweets:")
print(top_normal_words)

print("\nTop 20 bigrams in disaster tweets:")
print(top_disaster_bigrams)

print("\nTop 20 trigrams in disaster tweets:")
print(top_disaster_trigrams)

print("\nTop 20 bigrams in normal tweets:")
print(top_normal_bigrams)

print("\nTop 20 trigrams in normal tweets:")
print(top_normal_trigrams)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Top 20 words in disaster tweets:
[('the', 261406), (',', 209886), ('.', 187369), ('a', 159126), ('and', 117985), ('of', 110122), ('to', 109426), ('/', 83091), ('>', 82985), ('<', 82938), ('br', 82900), ('is', 82287), ('it', 81345), ('i', 74405), ('in', 69840), ('this', 65008), ('that', 59420), ("'s", 47238), ('movie', 45174), ('wa', 44364)]

Top 20 words in normal tweets:
[('the', 270822), (',', 224874), ('.', 185001), ('a', 171211), ('and', 140884), ('of', 121152), ('to', 104607), ('is', 91226), ('it', 82089), ('in', 78582), ('/', 77967), ('>', 77849), ('<', 77783), ('br', 77754), ('i', 64626), ('that', 55412), ('this', 55297), ("'s", 50786), ('film', 38435), ('with', 36492)]

Top 20 bigrams in disaster tweets:
[(('<', 'br'), 82900), (('br', '/'), 82900), (('/', '>'), 82900), (('>', '<'), 41457), (('of', 'the'), 28207), (('.', 'the'), 22213), ((',', 'and'), 21243), (('in', 'the'), 19582), (('.', 'i'), 18596), ((',', 'but'), 17241), (('this', 'movie'), 14115), (('it', "'s"), 13292), ((

In [4]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

#remove @ punctuation stop words etc.
def preprocess_tweet(tweet):
    tweet = re.sub(r'@[A-Za-z0-9_]+', '', tweet)
    tweet = re.sub(r'[^a-zA-Z0-9\s]', '', tweet)


    words = word_tokenize(tweet.lower())
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    return ' '.join(words)


disaster_tweets_preprocessed = [preprocess_tweet(tweet) for tweet in disaster_tweets]
normal_tweets_preprocessed = [preprocess_tweet(tweet) for tweet in normal_tweets]

print("Sample disaster tweets after preprocessing:")
for tweet in disaster_tweets_preprocessed[:5]:
    print("-", tweet)

print("\nSample normal tweets after preprocessing:")
for tweet in normal_tweets_preprocessed[:5]:
    print("-", tweet)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Sample disaster tweets after preprocessing:
- argued whether rent im always afraid renting something ive never heard dont remember theater great castthats tipped scale 30 minute almost stopped watching first minute fun watch unbelievable get worse writer movie could little research future project want make movie even little better could try writing something little bit believable give 3a 1 writing wordsand 2 able get many good actor agree movie despite read script oh god movie suck
- one dullest movie seen time im late 40 watched soninlaw early 20 son 17 scenery beautiful story bust watched hour turned spent time iphone hour watched spent actually watching movie gave 3 enjoyed scenery cinematography otherwise would given 1 im sure people really art find review appalling entitled opinion right couldnt figure supposed chick flick focus mother supposed movie guy focus battle adventure opinion didnt succeed either
- funny thing happening sitcom based main character jim either bad father ba

In [8]:
df_preprocessed = pd.read_csv('/content/review.csv')


X = df_preprocessed['review']
y = df_preprocessed['sentiment']


vectorizer_100 = CountVectorizer(max_features=100)
X_vectorized_100 = vectorizer_100.fit_transform(X)
vectorizer_1000 = CountVectorizer(max_features=1000)
X_vectorized_1000 = vectorizer_1000.fit_transform(X)


X_train_100, X_test_100, y_train, y_test = train_test_split(X_vectorized_100, y, test_size=0.2, random_state=42)
X_train_1000, X_test_1000, y_train, y_test = train_test_split(X_vectorized_1000, y, test_size=0.2, random_state=42)


logreg_100 = LogisticRegression(max_iter=1000)
logreg_100.fit(X_train_100, y_train)


y_pred_100 = logreg_100.predict(X_test_100)
accuracy_100 = accuracy_score(y_test, y_pred_100)
f1_100 = f1_score(y_test, y_pred_100, average='weighted')
recall_100 = recall_score(y_test, y_pred_100, average='weighted')


logreg_1000 = LogisticRegression(max_iter=1000)
logreg_1000.fit(X_train_1000, y_train)


y_pred_1000 = logreg_1000.predict(X_test_1000)
accuracy_1000 = accuracy_score(y_test, y_pred_1000)
f1_1000 = f1_score(y_test, y_pred_1000, average='weighted')
recall_1000 = recall_score(y_test, y_pred_1000, average='weighted')


print("Performance with max_features=100:")
print("Accuracy:", accuracy_100)
print("F1 Score:", f1_100)
print("Recall:", recall_100)

print("\nPerformance with max_features=1000:")
print("Accuracy:", accuracy_1000)
print("F1 Score:", f1_1000)
print("Recall:", recall_1000)

Performance with max_features=100:
Accuracy: 0.733625
F1 Score: 0.7335920477969484
Recall: 0.733625

Performance with max_features=1000:
Accuracy: 0.864625
F1 Score: 0.8646381045897199
Recall: 0.864625
