In [10]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [13]:
# Load data
# The training set is read from a CSV in the working directory; the last
# expression previews the first rows as the cell output.
train_csv_path = 'train.csv'
data = pd.read_csv(train_csv_path, encoding='utf-8')
data.head(5)

Unnamed: 0,text,sentiment
0,@united UA5396 can wait for me. I'm on the gro...,negative
1,I hate Time Warner! Soooo wish I had Vios. Can...,negative
2,"@united Oh, we are sure it's not planned, but ...",negative
3,Tom Shanahan's latest column on SDSU and its N...,neutral
4,Found the self driving car!! /IWo3QSvduneutral,neutral


In [14]:
# Preprocess data
# Shared resources for preprocess_text: a Porter stemmer and the English
# stopword list held as a set for membership tests on each token.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    """Normalise raw text into a space-joined string of stemmed tokens.

    Steps, in order: lowercase, drop @mentions and http(s) links, replace
    every non-alphabetic character with a space, tokenize, drop English
    stopwords, Porter-stem the remaining tokens and join them with spaces.

    Parameters
    ----------
    text : str or missing value
        Raw text. ``None`` and float NaN (e.g. an empty cell in the loaded
        CSV) are treated as empty text; any other non-string value is
        converted with ``str`` before cleaning.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces, or ``''`` when nothing
        alphabetic remains after cleaning.
    """
    # Missing values would otherwise reach re.sub and raise TypeError.
    # `text != text` is the NaN test (NaN is the only float unequal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    # Lowercase before stripping links so an upper-case scheme ("HTTP://...")
    # is matched by the pattern as well.
    text = text.lower()
    # Remove mentions and links
    text = re.sub(r'@\S+|https?://\S+', '', text)
    # Replace non-alphabetic characters with spaces
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Nothing left to tokenize: skip the tokenizer entirely.
    if not text.strip():
        return ''
    # Tokenize text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords and stem remaining words
    stemmed_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
    # Join stemmed tokens back into a single string
    return ' '.join(stemmed_tokens)

In [15]:
# Overwrites the raw 'text' column with its cleaned form, so re-running this
# cell feeds already-processed text through preprocess_text again.
data['text'] = data['text'].map(preprocess_text)

In [16]:
# Assign sentiment labels
def get_sentiment_label(sentiment):
    """Map a sentiment name to its integer class label.

    Parameters
    ----------
    sentiment : object
        Sentiment name. String input is compared case-insensitively and with
        surrounding whitespace ignored, so ``' Positive'`` is treated the
        same as ``'positive'``.

    Returns
    -------
    int
        ``1`` for positive, ``-1`` for negative, and ``0`` for everything
        else (neutral, unknown strings, and non-string values such as
        ``None`` or NaN).
    """
    # Normalise strings so case/whitespace variants of a label do not
    # silently fall through to the neutral class.
    if isinstance(sentiment, str):
        sentiment = sentiment.strip().lower()
    if sentiment == 'positive':
        return 1
    if sentiment == 'negative':
        return -1
    return 0

In [17]:
# Derive the numeric target column from the 'sentiment' strings.
data['sentiment_label'] = data['sentiment'].map(get_sentiment_label)

In [18]:
# Split data into training and testing sets
# 20% of the rows are held out for evaluation; the fixed random_state makes
# the split repeatable.
texts, labels = data['text'], data['sentiment_label']
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Extract features
# Token counts: the vectorizer is fitted on the training split only, and the
# test split is then encoded with that same fitted vectorizer.
vectorizer = CountVectorizer()
X_train_features, X_test_features = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

In [20]:
# Train model
# Multinomial Naive Bayes fitted on the training count features; the fitted
# estimator is left as the last expression so it is shown as the cell output.
model = MultinomialNB().fit(X_train_features, y_train)
model

MultinomialNB()

In [21]:
# Evaluate model
# Predict labels for the held-out features and report the accuracy score
# against the held-out labels.
y_pred = model.predict(X_test_features)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.73
