In [37]:
import pandas as pd
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
from nltk.util import ngrams
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [38]:
train_data = pd.read_csv('train.csv')

test_data = pd.read_csv('test.csv')

train_data

Unnamed: 0,review,sentiment,id
0,I argued with myself whether to rent this or n...,negative,41449
1,This was one of the dullest movies I have seen...,negative,18376
2,"I didn't know what to expect from 'Ned Kelly',...",positive,31081
3,All the funny things happening in this sitcom ...,negative,5696
4,We all know a movie never does complete justic...,negative,5714
...,...,...,...
39995,"Some good movies keep you in front of the TV, ...",negative,13645
39996,I recently watched Caprica again and thought I...,positive,45044
39997,A stupid teen supposed comedy that revolves a ...,negative,19453
39998,When Carol (Vanessa Hidalgo) starts looking in...,negative,13181


In [39]:
disasterTweets=train_data[train_data['sentiment'] == 'positive'][['sentiment', 'id']]
normalTweets=train_data[train_data['sentiment'] == 'negative'][['sentiment', 'id']]

disasterTweets,normalTweets

(      sentiment     id
 2      positive  31081
 5      positive  46036
 6      positive  23583
 7      positive  32635
 8      positive   6022
 ...         ...    ...
 39991  positive  31525
 39993  positive  47203
 39994  positive   5522
 39996  positive  45044
 39999  positive  13151
 
 [19938 rows x 2 columns],
       sentiment     id
 0      negative  41449
 1      negative  18376
 3      negative   5696
 4      negative   5714
 9      negative  18363
 ...         ...    ...
 39989  negative  27611
 39992  negative  34468
 39995  negative  13645
 39997  negative  19453
 39998  negative  13181
 
 [20062 rows x 2 columns])

In [40]:
# Lemmatize all words with NLTK
lemmatizer = WordNetLemmatizer()

def l_the_words(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(text.lower())])

train_data['review'] = train_data['review'].apply(l_the_words)

In [41]:
train_data

Unnamed: 0,review,sentiment,id
0,i argued with myself whether to rent this or n...,negative,41449
1,this wa one of the dullest movie i have seen i...,negative,18376
2,i did n't know what to expect from 'ned kelly ...,positive,31081
3,all the funny thing happening in this sitcom i...,negative,5696
4,we all know a movie never doe complete justice...,negative,5714
...,...,...,...
39995,"some good movie keep you in front of the tv , ...",negative,13645
39996,i recently watched caprica again and thought i...,positive,45044
39997,a stupid teen supposed comedy that revolves a ...,negative,19453
39998,when carol ( vanessa hidalgo ) start looking i...,negative,13181


In [42]:
trigrams = []
bigrams = []
def tokenize_reviews(reviews):
    tokens = []
    for review in reviews:
        tokenized = word_tokenize(review.lower())
        tokens.extend(tokenized)
        bigrams.extend(list(ngrams(tokenized,2)))
        trigrams.extend(list(ngrams(tokenized,3)))

    return tokens

def count_word_occurrences(df, sentiment):
    reviews = df[df['sentiment'] == sentiment]['review']

    tokens = tokenize_reviews(reviews)

    word_counts = Counter(tokens)
    return word_counts


disaster_words = count_word_occurrences(train_data, 'negative').most_common(20)
normal_words = count_word_occurrences(train_data, 'positive').most_common(20)

print('negative sentiment',disaster_words)
print('positive sentiment',normal_words)
print('bigram size',bigrams)
print('trigram size',trigrams)

print('Top 20 bigrams',Counter(bigrams).most_common(20))
print('Top 20 trigrams',Counter(trigrams).most_common(20))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Top 20 bigrams [(('<', 'br'), 160652), (('br', '/'), 160652), (('/', '>'), 160652), (('>', '<'), 80359), (('.', '<'), 67010), (('of', 'the'), 61058), ((',', 'and'), 46747), (('.', 'the'), 41351), (('in', 'the'), 39692), ((',', 'but'), 33357), (('.', 'i'), 30385), ((',', 'the'), 26785), (('it', "'s"), 26602), (('this', 'movie'), 25112), (('.', 'it'), 23104), (('the', 'film'), 21830), (('is', 'a'), 21397), (('and', 'the'), 20944), (('the', 'movie'), 19602), (('to', 'be'), 18773)]
Top 20 trigrams [(('<', 'br', '/'), 160652), (('br', '/', '>'), 160652), (('>', '<', 'br'), 80356), (('/', '>', '<'), 80354), (('.', '<', 'br'), 67005), (('/', '>', 'the'), 11724), (('one', 'of', 'the'), 7790), (('.', 'it', "'s"), 6982), (('/', '>', 'i'), 6795), ((',', 'and', 'the'), 5481), (('!', '!', '!'), 5079), (('i', 'do', "n't"), 4293), (('this', 'movie', 'is'), 4263), ((',', 'it', "'s"), 4211), (('of', 'the', 'film'), 4205), (('.', 'it', 'is'), 4032), (('*', '*', '*'), 3900), (('.', 'this', 'is'), 3868), 

Part B

In [43]:
nltk.download('stopwords')
import re

def partB(text):
  text = re.sub(r'@\w+', '', text)
  text = re.sub(r'[^\w\s]', '', text)
  tokens = word_tokenize(text)
  stop_words = set(stopwords.words('english'))
  filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
  filtered_text = ' '.join(filtered_tokens)
  return filtered_text

train_data['review'] = train_data['review'].apply(partB)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
train_data

Unnamed: 0,review,sentiment,id
0,argued whether rent always afraid renting some...,negative,41449
1,wa one dullest movie seen time late 40 watched...,negative,18376
2,nt know expect ned kelly absolutely loved wa d...,positive,31081
3,funny thing happening sitcom based main charac...,negative,5696
4,know movie never doe complete justice book exc...,negative,5714
...,...,...,...
39995,good movie keep front tv dying see result br b...,negative,13645
39996,recently watched caprica thought might well co...,positive,45044
39997,stupid teen supposed comedy revolves serious m...,negative,19453
39998,carol vanessa hidalgo start looking brother de...,negative,13181


Part C

In [45]:
m = train_data['sentiment'].unique()

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score

train_data.dropna(inplace=True)
train_data['sentiment'] = train_data['sentiment'].map({'positive': 1, 'negative': 0})

X_train, X_test, y_train, y_test = train_test_split(train_data['review'], train_data['sentiment'], test_size=0.2, random_state=42)

print(y_train)
max_features_values = [100, 1000]

for max_features in max_features_values:
    vectorizer = CountVectorizer(max_features=max_features)
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_vectorized, y_train)
    y_pred = model.predict(X_test_vectorized)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print(f"Max Features: {max_features}")
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    print(f"Recall: {recall}")

14307    0
17812    0
11020    0
15158    1
24990    1
        ..
6265     0
11284    1
38158    1
860      0
15795    0
Name: sentiment, Length: 32000, dtype: int64
Max Features: 100
Accuracy: 0.735875
F1 Score: 0.736369307548347
Recall: 0.753190403266973
Max Features: 1000
Accuracy: 0.861125
F1 Score: 0.8604797186989828
Recall: 0.8744257274119449
