In [None]:
import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords
import pickle


### Download the stopwords list if not already available

In [None]:
# Only contact the NLTK download server when the stopwords corpus is missing
# locally, so re-running the notebook (or running it offline) skips the
# download step entirely.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

### Load the dataset

In [None]:
# Read the dataset into a DataFrame; the relative path is resolved against
# the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='train_full_sentiments.csv')

### Map the sentiment labels to 0 (negative), 1 (neutral), and 2 (positive)

In [None]:
def map_sentiment(sentiment):
    """Collapse a fine-grained sentiment label into one of three classes.

    Parameters
    ----------
    sentiment : int or float
        Raw label. 0 and 1 are treated as negative, 2 as neutral, and
        3 and 4 as positive.
        NOTE(review): assumes the raw labels use a 0-4 scale, as the original
        branches suggest -- confirm against the dataset.

    Returns
    -------
    int
        0 for negative, 1 for neutral, 2 for positive.

    Raises
    ------
    ValueError
        If the label is not one of 0-4 (this includes missing values such as
        NaN), so bad rows are surfaced instead of silently becoming positive.
    """
    if sentiment in (0, 1):
        return 0  # Negative
    if sentiment == 2:
        return 1  # Neutral
    if sentiment in (3, 4):
        return 2  # Positive
    # Previously a catch-all `else` mapped *anything* else to positive.
    raise ValueError(f"Unexpected sentiment label: {sentiment!r}")

# Keep an untouched copy of the raw labels the first time this cell runs and
# always map from that copy. Mapping the 'Sentiment' column onto itself is not
# idempotent: on a second run an already-mapped 1 (neutral) would be read as a
# raw negative label and an already-mapped 2 (positive) as raw neutral.
if 'SentimentRaw' not in data.columns:
    data['SentimentRaw'] = data['Sentiment']
data['Sentiment'] = data['SentimentRaw'].apply(map_sentiment)

### Clean the text: replace non-word characters with spaces, convert to lowercase, and remove stopwords

In [None]:
# English stopword lookup, built once; stored as a set because clean_text
# performs a membership test for every token.
stop_words = {*stopwords.words('english')}
def clean_text(text):
    """Normalise a phrase before vectorisation.

    Non-word characters are replaced with spaces, the text is lowercased, and
    tokens found in the module-level ``stop_words`` set are dropped.

    Parameters
    ----------
    text : str
        Raw phrase. Missing values (None/NaN) are treated as empty text and
        any other non-string value is converted with ``str`` instead of
        raising ``TypeError`` inside ``re.sub``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (possibly empty).
    """
    if not isinstance(text, str):
        # pandas represents empty / NA-like CSV cells as NaN, which is a
        # float, not a string.
        text = '' if pd.isna(text) else str(text)
    text = re.sub(r'\W', ' ', text).lower()
    return ' '.join(word for word in text.split() if word not in stop_words)

# Run the cleaner over every phrase, then overwrite the column with the result.
cleaned_phrases = data['Phrase'].apply(clean_text)
data['Phrase'] = cleaned_phrases

### Split the data into training and testing sets

In [None]:
# Features are the phrase texts, targets are the sentiment classes.
X, y = data['Phrase'], data['Sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Vectorize the text with TF-IDF over unigrams and bigrams, capping the vocabulary at 5,000 features

In [None]:
# Unigrams and bigrams, with the vocabulary capped at 5000 features.
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
)

# Fit on the training split only, then reuse the fitted vectorizer for the
# test split so no test-set statistics leak into the features.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

### Train a Logistic Regression model (faster than Random Forest)

In [None]:
# Build the classifier with an iteration limit of 1000, then fit it on the
# TF-IDF training matrix.
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train_tfidf, y=y_train)

### Evaluate the Logistic Regression model

In [None]:
# Predict on the held-out split and summarise the results.
y_pred = model.predict(X_test_tfidf)
classification_report_lr = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print('Logistic Regression Model Accuracy:', accuracy)
print(classification_report_lr)

### Save the model and TF-IDF vectorizer for later use in the Streamlit app

In [None]:
# Persist the fitted classifier and the fitted vectorizer, one pickle file
# each, written in the same order as before (model first, then vectorizer).
for artifact_path, artifact in (
    ('sentiment_model_lr.pkl', model),
    ('tfidf_vectorizer.pkl', tfidf),
):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)