In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load data
# Only the 'text' and 'sentiment' columns are used further down, so drop rows
# that are missing either of those. A bare dropna() would also throw away rows
# whose only gap is in a column the model never reads.
train_data = pd.read_csv('train.csv')
train_data = train_data.dropna(subset=['text', 'sentiment'])

In [3]:
# Preprocess data
# The stopword corpus and the tokenizer models are separate NLTK downloads.
# When they are not installed NLTK raises LookupError, so fetch them on demand
# instead of failing on a fresh environment. Nothing is downloaded when the
# resources are already present.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

try:
    # Probe the tokenizer once so a missing model surfaces here, not mid-run.
    word_tokenize('tokenizer availability check')
except LookupError:
    # NOTE(review): the package holding the tokenizer models is 'punkt' on
    # older NLTK releases and 'punkt_tab' on newer ones; request both so
    # either release is covered -- confirm against the pinned NLTK version.
    for tokenizer_package in ('punkt', 'punkt_tab'):
        nltk.download(tokenizer_package, quiet=True)

stemmer = PorterStemmer()

In [4]:
def preprocess_text(text):
    """Normalize a piece of text into a space-separated string of stems.

    The text is lowercased and split with ``word_tokenize``; tokens found in
    the module-level ``stop_words`` set are discarded and every remaining
    token is passed through the module-level ``stemmer``.

    Args:
        text: The string to normalize (``.lower()`` is called on it).

    Returns:
        The stemmed, non-stopword tokens joined by single spaces.
    """
    kept_stems = []
    for word in word_tokenize(text.lower()):
        # The stopword check is done on the raw lowercase token, before stemming.
        if word in stop_words:
            continue
        kept_stems.append(stemmer.stem(word))

    return ' '.join(kept_stems)

In [5]:
# Replace every raw training text with its preprocessed form.
# Note: this overwrites the 'text' column, so re-running this cell without
# first re-running the load cell preprocesses the text a second time.
train_data['text'] = train_data['text'].map(preprocess_text)

In [6]:
# Extract features
# Targets: the sentiment label of each training row.
y_train = train_data['sentiment']

# Inputs: token counts; the vocabulary is learned from the preprocessed
# training text and kept on `vectorizer` for reuse on new text.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_data['text'])

In [7]:
# Train model
# Fit a multinomial Naive Bayes classifier on the training count matrix.
# fit() returns the estimator itself, so it can be bound in one step.
model = MultinomialNB().fit(X_train, y_train)
model

MultinomialNB()

In [8]:
# Test model on array of tweets
# A handful of hand-written example sentences to run through the model.
test_tweets = np.array([
    'This is a great product!',
    'I hate this service.',
    'The food was okay.',
    'I am not sure how I feel about this.',
    'Just saw the best movie ever!',
    'The worst experience ever.',
])

In [9]:
# Preprocess test tweets
# Run the same preprocess_text() used on the training text so both go
# through identical normalization.
test_tweets_preprocessed = list(map(preprocess_text, test_tweets))

In [10]:
# Extract features for test tweets
# transform() (not fit_transform) reuses the vectorizer fitted on the
# training text instead of fitting a new one.
X_test = vectorizer.transform(raw_documents=test_tweets_preprocessed)

In [11]:
# Predict sentiment labels for test tweets
# Score the vectorized test tweets with the trained classifier.
y_pred = model.predict(X=X_test)

In [12]:
# Print predicted sentiment labels
# y_pred holds the classifier's predictions for X_test, which was built
# from test_tweets.
print(y_pred)

['positive' 'negative' 'neutral' 'negative' 'positive' 'negative']
