In [10]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer
from collections import Counter
import re

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

df = pd.read_csv('/content/train.csv')

positiveReviews = df[df['sentiment'] == 'positive']['review']
negativeReviews = df[df['sentiment'] == 'negative']['review']

lemmatizer = WordNetLemmatizer()

def preprocess_reviews(reviews):
    lemmatized_words = []
    for review in reviews:
        review = review.lower()
        review = re.sub(r'https?://\S+|www\.\S+|@[A-Za-z0-9]+|#|\d+', '', review)
        tokens = word_tokenize(review)
        lemmatized = [lemmatizer.lemmatize(token) for token in tokens]
        lemmatized_words.extend(lemmatized)
    return lemmatized_words

lemmatized_positive = preprocess_reviews(positiveReviews)
lemmatized_negative = preprocess_reviews(negativeReviews)

positive_word_counts = Counter(lemmatized_positive)
negative_word_counts = Counter(lemmatized_negative)

top20_positive_words = positive_word_counts.most_common(20)
top20_negative_words = negative_word_counts.most_common(20)

def find_ngrams(input_list, n):
    n_grams = ngrams(input_list, n)
    return Counter(n_grams)

positive_bigrams = find_ngrams(lemmatized_positive, 2)
positive_trigrams = find_ngrams(lemmatized_positive, 3)

print("Top 20 Positive Bigrams:", positive_bigrams.most_common(20))
print("Top 20 Positive Trigrams:", positive_trigrams.most_common(20))

negative_bigrams = find_ngrams(lemmatized_negative, 2)
negative_trigrams = find_ngrams(lemmatized_negative, 3)

print("Top 20 Negative Bigrams:", negative_bigrams.most_common(20))
print("Top 20 Negative Trigrams:", negative_trigrams.most_common(20))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Top 20 Positive Bigrams: [(('/', '>'), 77752), (('<', 'br'), 77742), (('br', '/'), 77742), (('>', '<'), 38904), (('of', 'the'), 32863), ((',', 'and'), 25524), (('.', 'the'), 20995), (('in', 'the'), 20123), (('.', 'i'), 17576), ((',', 'but'), 16115), ((',', 'the'), 13638), (('it', "'s"), 13312), (('is', 'a'), 12790), (('.', 'it'), 12579), (('the', 'film'), 11688), (('and', 'the'), 11299), (('this', 'movie'), 10675), (('to', 'the'), 10031), (('it', 'is'), 9378), ((',', 'a'), 8715)]
Top 20 Positive Trigrams: [(('<', 'br', '/'), 77742), (('br', '/', '>'), 77742), (('>', '<', 'br'), 38901), (('/', '>', '<'), 38899), (('/', '>', 'the'), 5831), (('.', '<', 'br'), 5033), (('one', 'of', 'the'), 4627), (('.', 'it', "'s"), 3805), (('/', '>', 'i'), 3163), (('.', 'this', 'is'), 2976), ((',', 'and', 'the'), 2881), (('!', '!', '!'), 2604), (('.', 'it', 'is'), 2517), (('this', 'is', 'a'), 2412), (('it', "'s", 'a'), 2226), (('of', 'the', 'film'), 2183), ((',', 'it', "'s"), 2047), (('a', 'lot', 'of'), 2

In [11]:
def full_preprocess_reviews(reviews):
    stop_words = set(stopwords.words('english'))
    preprocessed_reviews = []
    for review in reviews:
        review = re.sub(r'https?://\S+|www\.\S+|@[A-Za-z0-9]+|#|\d+', '', review)
        review = re.sub(r'[^\w\s]', '', review)
        review = review.lower()
        tokens = word_tokenize(review)
        lemmatized = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
        preprocessed_reviews.append(' '.join(lemmatized))
    return preprocessed_reviews

preprocessed_positive = full_preprocess_reviews(positiveReviews)
preprocessed_negative = full_preprocess_reviews(negativeReviews)


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score
import pandas as pd

df_preprocessed = pd.DataFrame({
    'text': preprocessed_positive + preprocessed_negative,
    'target': [1] * len(preprocessed_positive) + [0] * len(preprocessed_negative)
})

X = df_preprocessed['text']
y = df_preprocessed['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def train_evaluate_logistic_regression(max_features):
    vectorizer = CountVectorizer(max_features=max_features)
    X_train_vect = vectorizer.fit_transform(X_train)
    X_test_vect = vectorizer.transform(X_test)

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_vect, y_train)
    predictions = model.predict(X_test_vect)

    print(f"Max Features: {max_features}")
    print("Accuracy:", accuracy_score(y_test, predictions))
    print("F1 Score:", f1_score(y_test, predictions))
    print("Recall:", recall_score(y_test, predictions))
    print("")

train_evaluate_logistic_regression(100)
train_evaluate_logistic_regression(1000)


Max Features: 100
Accuracy: 0.738125
F1 Score: 0.7424708051628766
Recall: 0.7538691962056915

Max Features: 1000
Accuracy: 0.852125
F1 Score: 0.8529155787641426
Recall: 0.8562156764852721

