In [37]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
from nltk import ngrams

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [54]:
train = pd.read_csv("train.csv")
train

train['sentiment'] = train['sentiment'].apply(lambda x : 1 if x=='positive' else 0)
train

Unnamed: 0,review,sentiment,id
0,I argued with myself whether to rent this or n...,0,41449
1,This was one of the dullest movies I have seen...,0,18376
2,"I didn't know what to expect from 'Ned Kelly',...",1,31081
3,All the funny things happening in this sitcom ...,0,5696
4,We all know a movie never does complete justic...,0,5714
...,...,...,...
39995,"Some good movies keep you in front of the TV, ...",0,13645
39996,I recently watched Caprica again and thought I...,1,45044
39997,A stupid teen supposed comedy that revolves a ...,0,19453
39998,When Carol (Vanessa Hidalgo) starts looking in...,0,13181


In [55]:
# Split the DataFrame based on the 'sentiment' column
disaster_tweets = train[train['sentiment'] == 1]
normal_tweets = train[train['sentiment'] == 0]

In [56]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    # Remove mentions (@)
    text = re.sub(r'@\w+', '', text)
    # Remove punctuations and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [word for word in tokens if word.lower() not in stop_words]
    # Lemmatize tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens


In [57]:
# Preprocess disaster tweets
train['preprocessed_tweet'] = train['review'].apply(preprocess_text)


In [58]:
# Initialize lists to store preprocessed tokens
disaster_all_words = []
normal_all_words = []

# Iterate through each row in the DataFrame
for index, row in train.iterrows():
    # Check the value of the 'sentiment' column
    if row['sentiment'] == 1:
        # Concatenate tokens for disaster tweets
        disaster_all_words.extend(row['preprocessed_tweet'])
    elif row['sentiment'] == 0:
        # Concatenate tokens for normal tweets
        normal_all_words.extend(row['preprocessed_tweet'])



In [59]:
# Count occurrences of each word in disaster and normal tweets
disaster_word_c = Counter(disaster_all_words)
normal_word_c = Counter(normal_all_words)

In [60]:
# Print word counts for disaster tweets
print("Word Counts for Disaster Tweets:")
for word, count in disaster_word_c.items():
    print(word, "-", count)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
Brimleys - 1
Hillermans - 1
Lowrie - 1
brashnessbr - 1
Buckner - 2
fictionalizedbr - 1
McCallister - 1
Paging - 1
Tripping - 1
Lizziebr - 1
DESCRIPTION - 1
Greeley - 1
strandsand - 1
Kathleens - 1
Socorro - 1
Pisanabr - 1
IIScamps - 1
aparthied - 2
marketor - 1
dimlylighted - 1
Venturing - 1
Deulling - 1
Banjosbr - 1
cityslickers - 2
sitesbr - 1
everydaylife - 1
pitchedinto - 1
Dicey - 1
canoers - 1
Aintry - 1
mountainmen - 1
mountainmenbr - 1
fightforyourlife - 1
neardangerous - 1
30outof10 - 1
subjectively - 1
imPeckable - 1
skyward - 1
womenAdela - 1
Isabelwho - 1
affixed - 1
podiatrist - 1
coalesce - 3
Rewind - 1
Unveil - 1
piedras - 1
nearheartening - 1
augmentation - 1
redirects - 1
subjectified - 1
Spirogolous - 1
spendthrift - 1
seconded - 1
penancebr - 1
invadersbr - 1
breachedbr - 1
Bhukya - 1
Landlady - 1
Dammannagari - 1
Landladys - 1
Karan - 1
Landladybr - 1
Uhoh - 1
Militant - 1
Ironsides - 

In [61]:
# Print word counts for normal tweets
print("\nWord Counts for Normal Tweets:")
for word, count in normal_word_c.items():
    print(word, "-", count)


[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
Reptilian - 1
Acting310br - 1
drinkbut - 1
wittiest - 1
Genuine - 2
Kermode - 1
Theakston - 1
Moo - 1
RAGING - 1
BULL - 1
GOODFELLAS - 1
AVIATOR - 2
cert - 1
MDBs - 1
gawds - 1
matchinsult - 1
noisey - 1
Kristofersons - 1
ORGANIZED - 1
Artistsbr - 1
Anyhoobr - 1
Jaffes - 1
penandpaper - 1
DDs - 1
controversythat - 1
horrendouslybr - 1
socialize - 1
alsothough - 1
differentbut - 1
prig - 1
Entrance - 1
RobbiePardu - 1
oafiest - 1
Pardus - 1
Caporetto - 2
CourtsMartial - 1
Martinique - 1
impassive - 1
picturepostcard - 1
Kerenskys - 1
armistice - 1
Treaty - 1
BrestLitovsk - 1
Masiela - 1
Lusha - 1
Slaves - 1
studiously - 1
clubkids - 1
scenester - 1
Breakin - 1
Bugaloo - 1
copypaste - 1
22h45 - 1
Daens - 1
dummywithout - 1
Castevets - 2
holdover - 1
Maharis - 1
Blackmer - 1
idealets - 1
leaddirector - 1
Swanberg - 2
trueness - 1
postcollege - 1
Winterich - 1
watchinglistening - 1
showerhabits - 1
Toddlers -

In [62]:
# Find top 20 words by occurrence for disaster and normal tweets
top_disaster_words = disaster_word_c.most_common(20)
top_normal_words = normal_word_c.most_common(20)

print("Top 20 words in disaster tweets:")
for word, count in top_disaster_words:
    print(f"{word}: {count}")

print("\nTop 20 words in normal tweets:")
for word, count in top_normal_words:
    print(f"{word}: {count}")

Top 20 words in disaster tweets:
br: 43911
film: 37572
movie: 34005
one: 19864
like: 13479
time: 11862
good: 11023
character: 10860
story: 10502
see: 9744
great: 9375
get: 8682
make: 8622
really: 8361
would: 8177
well: 8027
scene: 7682
also: 7449
show: 7148
much: 6983

Top 20 words in normal tweets:
br: 46635
movie: 44090
film: 33368
one: 19049
like: 17288
time: 11046
character: 11002
good: 10966
would: 10775
bad: 10704
even: 10638
get: 10353
make: 9962
really: 9392
scene: 8728
see: 8606
story: 8166
much: 7755
thing: 7228
could: 7135


In [63]:
# Find bigrams and trigrams range
disaster_bigrams = list(ngrams(disaster_all_words, 2))
normal_bigrams = list(ngrams(normal_all_words, 2))

disaster_trigrams = list(ngrams(disaster_all_words, 3))
normal_trigrams = list(ngrams(normal_all_words, 3))

In [64]:
# Find top 20 bigrams and trigrams
bigram_counts = Counter(ngrams(disaster_all_words, 2))
trigram_counts = Counter(ngrams(disaster_all_words, 3))

top_20_bigrams = bigram_counts.most_common(20)
top_20_trigrams = trigram_counts.most_common(20)

# Find top 20 bigrams and trigrams
bigram_count = Counter(ngrams(normal_all_words, 2))
trigram_count = Counter(ngrams(normal_all_words, 3))

top_20_bigram = bigram_counts.most_common(20)
top_20_trigram = trigram_counts.most_common(20)

In [65]:
# Print top 20 bigrams and trigrams
print("Top 20 Bigrams:")
for bigram, count in top_20_bigrams:
    print(' '.join(bigram), "-", count)

print("\nTop 20 Trigrams:")
for trigram, count in top_20_trigrams:
    print(' '.join(trigram), "-", count)


Top 20 Bigrams:
br br - 5070
br film - 1180
one best - 1166
br movie - 880
Ive seen - 826
first time - 714
ever seen - 707
itbr br - 703
dont know - 674
even though - 658
main character - 655
New York - 636
filmbr br - 600
year old - 597
special effect - 589
look like - 578
see movie - 572
year ago - 572
good movie - 552
movie like - 548

Top 20 Trigrams:
Ive ever seen - 266
New York City - 134
br br film - 128
World War II - 123
film Ive seen - 122
film ever made - 105
one best movie - 105
movie Ive seen - 96
movie br br - 94
one best film - 92
movie ever seen - 91
first time saw - 89
br br movie - 89
film br br - 85
movie ever made - 85
first saw movie - 78
best movie ever - 73
based true story - 73
br Title Brazil - 70
highly recommend movie - 67


In [66]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score

In [68]:
# Convert 'preprocessed_tweet' column to strings
train['preprocessed_tweet'] = train['preprocessed_tweet'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)


# Initialize CountVectorizer with different values of max_features
max_features_values = [100, 1000]


for max_features in max_features_values:
    print(f"\nMax Features: {max_features}")

    # Vectorize preprocessed text
    vectorizer = CountVectorizer(max_features=max_features)
    X = vectorizer.fit_transform(train['preprocessed_tweet'])

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, train['sentiment'], test_size=0.2, random_state=42)

    # Train Logistic Regression model
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Predict on test set
    y_pred = model.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    # Print evaluation metrics
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    print(f"Recall: {recall}")


Max Features: 100
Accuracy: 0.73175
F1 Score: 0.73175397345536
Recall: 0.73175

Max Features: 1000
Accuracy: 0.8595
F1 Score: 0.8595119165502411
Recall: 0.8595


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
