Imports

In [35]:
import pandas as pd 
import numpy as np 
import re 
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score, f1_score, recall_score 
from nltk import ngrams 
from nltk import FreqDist

Data frames

In [36]:
df = pd.read_csv("train.csv") 
 
disasterTweets = df[df['sentiment'] == 'negative']['review'] 
normalTweets = df[df['sentiment'] == 'positive']['review'] 

Part A: EDA (exploratory data analysis):

In [37]:
lemmatizer = WordNetLemmatizer()

disasterWords = [word_tokenize(tweet.lower()) for tweet in disasterTweets]
disasterWordsLemmatized = [[lemmatizer.lemmatize(word) for word in words] for words in disasterWords]

normalWords = [word_tokenize(tweet.lower()) for tweet in normalTweets]
normalWordsLemmatized = [[lemmatizer.lemmatize(word) for word in words] for words in normalWords]

disasterWordsFlat = [word for sublist in disasterWordsLemmatized for word in sublist]
normalWordsFlat = [word for sublist in normalWordsLemmatized for word in sublist]

disasterWordFreq = FreqDist(disasterWordsFlat)
normalWordFreq = FreqDist(normalWordsFlat)

topDisasterWords = disasterWordFreq.most_common(20)
topNormalWords = normalWordFreq.most_common(20)

disasterBigrams = list(ngrams(disasterWordsFlat, 2))
disasterTrigrams = list(ngrams(disasterWordsFlat, 3))

normalBigrams = list(ngrams(normalWordsFlat, 2))
normalTrigrams = list(ngrams(normalWordsFlat, 3))

topDisasterBigrams = FreqDist(disasterBigrams).most_common(20)
topDisasterTrigrams = FreqDist(disasterTrigrams).most_common(20)

topNormalBigrams = FreqDist(normalBigrams).most_common(20)
topNormalTrigrams = FreqDist(normalTrigrams).most_common(20)

print("Top 20 words in disaster tweets:")
print(topDisasterWords)

print("\nTop 20 words in normal tweets:")
print(topNormalWords)

print("\nTop 20 bigrams in disaster tweets:")
print(topDisasterBigrams)

print("\nTop 20 trigrams in disaster tweets:")
print(topDisasterTrigrams)

print("\nTop 20 bigrams in normal tweets:")
print(topNormalBigrams)

print("\nTop 20 trigrams in normal tweets:")
print(topNormalTrigrams)

Top 20 words in disaster tweets:
[('the', 261406), (',', 209886), ('.', 187369), ('a', 159126), ('and', 117985), ('of', 110122), ('to', 109426), ('/', 83091), ('>', 82985), ('<', 82938), ('br', 82900), ('is', 82287), ('it', 81345), ('i', 74405), ('in', 69840), ('this', 65008), ('that', 59420), ("'s", 47238), ('movie', 45174), ('wa', 44364)]

Top 20 words in normal tweets:
[('the', 270822), (',', 224874), ('.', 185001), ('a', 171211), ('and', 140884), ('of', 121152), ('to', 104607), ('is', 91226), ('it', 82089), ('in', 78582), ('/', 77967), ('>', 77849), ('<', 77783), ('br', 77754), ('i', 64626), ('that', 55412), ('this', 55297), ("'s", 50786), ('film', 38435), ('with', 36492)]

Top 20 bigrams in disaster tweets:
[(('<', 'br'), 82900), (('br', '/'), 82900), (('/', '>'), 82900), (('>', '<'), 41457), (('of', 'the'), 28207), (('.', 'the'), 22213), ((',', 'and'), 21243), (('in', 'the'), 19582), (('.', 'i'), 18596), ((',', 'but'), 17241), (('this', 'movie'), 14115), (('it', "'s"), 13292), ((

Part B: Preprocessing part:

In [38]:

lemmatizer = WordNetLemmatizer()
stopWords = set(stopwords.words('english'))

def preprocessText(tweet):
    tweet = re.sub(r'@[A-Za-z0-9_]+', '', tweet)
    tweet = re.sub(r'[^a-zA-Z0-9\s]', '', tweet)
    tweet = re.sub(r'[^\w\s]', '', tweet)
    
    words = word_tokenize(tweet.lower())
    words = [lemmatizer.lemmatize(word) for word in words if word not in stopWords]
    
    return ' '.join(words)

disasterTweetsPreprocessed = [preprocessText(tweet) for tweet in disasterTweets.tolist()]

normalTweetsPreprocessed = [preprocessText(tweet) for tweet in normalTweets.tolist()]

print("Sample disaster tweets after preprocessing:")
for tweet in disasterTweetsPreprocessed[:5]:
    print("#", tweet)

print("\nSample normal tweets after preprocessing:")
for tweet in normalTweetsPreprocessed[:5]:
    print("#", tweet)

df['preprocessed'] = df['review'].apply(preprocessText)

Sample disaster tweets after preprocessing:
# argued whether rent im always afraid renting something ive never heard dont remember theater great castthats tipped scale 30 minute almost stopped watching first minute fun watch unbelievable get worse writer movie could little research future project want make movie even little better could try writing something little bit believable give 3a 1 writing wordsand 2 able get many good actor agree movie despite read script oh god movie suck
# one dullest movie seen time im late 40 watched soninlaw early 20 son 17 scenery beautiful story bust watched hour turned spent time iphone hour watched spent actually watching movie gave 3 enjoyed scenery cinematography otherwise would given 1 im sure people really art find review appalling entitled opinion right couldnt figure supposed chick flick focus mother supposed movie guy focus battle adventure opinion didnt succeed either
# funny thing happening sitcom based main character jim either bad father ba

Part C: Implement Logistic regression

In [39]:
X_train, X_test, y_train, y_test = train_test_split(df['preprocessed'], df['sentiment'], test_size=0.2, random_state=50)


def modelFunction(X_train, X_test, y_train, y_test, max_features):
    vectorizer = CountVectorizer(max_features=max_features)
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    model = LogisticRegression()
    model.fit(X_train_vectorized, y_train)
    y_pred = model.predict(X_test_vectorized)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, pos_label='positive')
    recall = recall_score(y_test, y_pred, pos_label='positive')

    return accuracy, f1, recall,model


accuracy, f1, recall,model = modelFunction(X_train, X_test, y_train, y_test, 100)
print(f"\nResults for max_features={100}:")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Recall: {recall}")

accuracy, f1, recall,model = modelFunction(X_train, X_test, y_train, y_test, 1000)
print(f"\nResults for max_features={1000}:")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Recall: {recall}")


test = pd.read_csv('test.csv')

test['preprocessed_text'] = test['review'].apply(preprocessText)

vectorizer = CountVectorizer(max_features=1000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(test['preprocessed_text'])

y_pred_test = model.predict(X_test_vectorized)

results = pd.DataFrame({'id': test['id'], 'sentiment': y_pred_test})

print("\nPredictions for Test Data:")
print(results.to_string(index=False, header=True, justify='center'))


Results for max_features=100:
Accuracy: 0.739375
F1 Score: 0.7394726977383481
Recall: 0.7380892990770765

Results for max_features=1000:
Accuracy: 0.857875
F1 Score: 0.8586348377471092
Recall: 0.8613120478922425

Predictions for Test Data:
  id  sentiment
21681  positive
10829  positive
37269  positive
13397  negative
41784  positive
33196  positive
43302  negative
38268  positive
 8107  positive
29204  positive
40490  negative
38820  negative
 2884  positive
47759  positive
30850  negative
40121  negative
 6552  positive
14492  negative
19848  negative
27884  positive
49055  negative
11742  negative
36967  positive
49472  positive
45723  positive
 2588  negative
11385  positive
42355  negative
13695  negative
23168  negative
17993  positive
27222  positive
37388  positive
46759  negative
 6541  positive
31650  positive
31769  negative
22437  negative
10764  negative
10496  negative
15356  positive
28615  negative
24269  positive
14731  negative
46686  positive
46383  positive
42072  