In [15]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import joblib # For saving the model

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Download the NLTK stopwords corpus; the text-cleaning step reads the
# English list from it via stopwords.words('english').
# The call is the last expression of the cell, so its return value is displayed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Load the dataset. The CSV is read without a header row, so the columns are
# named explicitly below: the sentiment label first, then the text.
try:
    df = pd.read_csv('all-data.csv', encoding='latin-1', header=None)
except FileNotFoundError:
    print("Error: Dataset not found. Please place 'all-data.csv' next to this notebook "
          "or update the file path. Falling back to a small dummy dataset.")
    # Dummy DataFrame for demonstration purposes. The keys are listed in the
    # same order as the real file (label, then text) so the positional rename
    # below labels both sources correctly.
    data = {'sentiment': ['positive', 'negative', 'neutral'],
            'text': ['This is a positive review.', 'This is a negative review.', 'This is a neutral review.']}
    df = pd.DataFrame(data)

# Rename columns positionally: first column is the label, second is the text
df.columns = ['sentiment', 'text']

# Preprocess the text data
def _get_text_tools():
    """Return the (stopword set, stemmer) pair, building it on first use only."""
    tools = getattr(_get_text_tools, '_cached', None)
    if tools is None:
        tools = (frozenset(stopwords.words('english')), PorterStemmer())
        _get_text_tools._cached = tools
    return tools


def preprocess_text(text):
    """Normalise one document into a space-joined string of stemmed tokens.

    Steps, in order: strip URLs, drop every character that is not an ASCII
    letter or whitespace, lowercase, split on whitespace, remove English
    stopwords, and Porter-stem what remains.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            the NaN/None of a missing cell) is treated as an empty document.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing is left.
    """
    if not isinstance(text, str):
        # re.sub would raise TypeError on NaN/None; an empty document is safer
        return ''

    # Built once and reused, instead of once per row under DataFrame.apply
    stopword_set, stemmer = _get_text_tools()

    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    tokens = text.lower().split()  # Lowercase, then tokenize on whitespace

    # NOTE(review): apostrophes are stripped above, so a contraction such as
    # "don't" arrives here as "dont" and presumably no longer matches the
    # stopword entry for it -- confirm whether that is intended.
    return ' '.join(stemmer.stem(tok) for tok in tokens if tok not in stopword_set)

# Store a normalised copy of every document next to the raw text
df['cleaned_text'] = df['text'].apply(preprocess_text)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vocabulary (capped at 5000 terms) on the training split only,
# then map the held-out split onto that same vocabulary
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [17]:
# Sanity check: list the frame's columns after the rename and the added 'cleaned_text' column
print(df.columns)

Index(['sentiment', 'text', 'cleaned_text'], dtype='object')


In [18]:
# Train a Logistic Regression model (constructed with its default settings)
# on the TF-IDF features and labels of the training split
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

In [19]:
# Score the classifier on the held-out split
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

# Headline accuracy first, then the per-class precision/recall/F1 table
print(f'Accuracy: {accuracy}', 'Classification Report:', report, sep='\n')

Accuracy: 0.7494845360824742
Classification Report:
              precision    recall  f1-score   support

    negative       0.78      0.46      0.58       110
     neutral       0.74      0.95      0.83       571
    positive       0.78      0.47      0.59       289

    accuracy                           0.75       970
   macro avg       0.77      0.63      0.67       970
weighted avg       0.76      0.75      0.73       970



In [20]:
# Save the trained model and vectorizer.
# Both are written because the model was fitted on this vectorizer's output,
# so new text has to go through the same fitted vectorizer before prediction.
# preprocess_text is not saved here; it must be applied to new text separately.
# NOTE(review): these .pkl files are pickle-based -- only load them from a trusted source.
joblib.dump(model, 'sentiment_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']