In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import joblib

In [15]:
# Read the raw CSV; the file has no header row, so column names are supplied here.
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
    header=None,
    names=['target', 'id', 'date', 'flag', 'user', 'text'],
)

In [16]:
# Translate the raw target codes into sentiment labels:
# 0 -> -1, 4 -> 1, and every other value -> 0.
df['sentiment'] = df['target'].map({0: -1, 4: 1}).fillna(0).astype('int64')

# Keep a reproducible 1% random sample of the rows.
# NOTE(review): this rebinds `df`, so re-running the cell samples the sample again.
df = df.sample(frac=0.01, random_state=42)

# Hold out 20% of the sampled rows for testing.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Pull out the text and label arrays used by the later cells.
train_texts, train_sentiments = train_df['text'].values, train_df['sentiment'].values
test_texts, test_sentiments = test_df['text'].values, test_df['sentiment'].values

In [17]:
# Fetch the NLTK resources this notebook relies on:
#   - 'stopwords' backs stopwords.words('english') below
#   - 'punkt' / 'punkt_tab' back word_tokenize; newer NLTK releases look up
#     'punkt_tab' instead of 'punkt', so both are requested to keep the
#     notebook runnable on a fresh environment.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# A set gives constant-time membership checks while filtering tokens.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Tokenize `text` and drop English stopwords.

    Tokens are compared case-insensitively against the module-level
    `stop_words` set, but the tokens that survive keep their original
    casing. Returns the remaining tokens joined by single spaces.
    """
    kept_tokens = (
        token for token in word_tokenize(text)
        if token.lower() not in stop_words
    )
    return ' '.join(kept_tokens)

# Strip stopwords from every training and test document.
train_texts = list(map(preprocess_text, train_texts))
test_texts = list(map(preprocess_text, test_texts))

# Learn a TF-IDF vocabulary (capped at 5000 features) from the training
# texts only, then encode both splits with that same vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
train_features = vectorizer.fit_transform(train_texts)
test_features = vectorizer.transform(test_texts)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tejas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tejas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
# Fit a linear-kernel SVM on the TF-IDF training features.
svm_model = SVC(kernel='linear', random_state=42).fit(train_features, train_sentiments)

# Score the held-out test split.
predictions = svm_model.predict(test_features)
classification_rep = classification_report(test_sentiments, predictions)
accuracy = accuracy_score(test_sentiments, predictions)

# Report overall accuracy followed by the per-class breakdown.
print(f'Accuracy: {accuracy:.2f}')
print(f'Classification Report:\n{classification_rep}')

Accuracy: 0.72
Classification Report:
              precision    recall  f1-score   support

          -1       0.73      0.69      0.71      1587
           1       0.71      0.75      0.73      1613

    accuracy                           0.72      3200
   macro avg       0.72      0.72      0.72      3200
weighted avg       0.72      0.72      0.72      3200



In [19]:
# Destination files for the fitted artifacts.
svm_model_path, vectorizer_path = "./svm_model.pkl", "./tfidf_vectorizer.pkl"

# Persist the classifier first, then the vectorizer. The vectorizer dump stays
# the cell's last expression, so its return value is what the cell displays.
joblib.dump(svm_model, svm_model_path)
joblib.dump(vectorizer, vectorizer_path)


['./tfidf_vectorizer.pkl']

In [20]:
# Restore the persisted classifier and TF-IDF vectorizer from disk.
svm_model = joblib.load('svm_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Read the news headlines to score.
# NOTE(review): `load_news_headlines` is not defined in any cell shown here —
# confirm it is defined or imported before this cell runs.
csv_path = './abcnews-date-text.csv'
news_headlines = load_news_headlines(csv_path)

# Strip stopwords from every headline, then keep only the first 1000.
# NOTE(review): every headline is preprocessed even though only 1000 are used.
preprocessed_headlines_full = list(map(preprocess_text, news_headlines))
preprocessed_headlines = preprocessed_headlines_full[:1000]

# Encode with the fitted TF-IDF vocabulary and predict a label per headline.
headline_features = vectorizer.transform(preprocessed_headlines)
headline_polarities = svm_model.predict(headline_features)

# Shift and halve the predicted labels: -1 becomes 0 and 1 becomes 1.
polarities_scaled = (headline_polarities + 1) / 2