In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Read the raw tweet CSV into a DataFrame.
# header=None makes pandas treat the first row as data, so the six column
# names are supplied explicitly; the file is decoded as latin-1.
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    header=None,
    names=['target', 'id', 'date', 'flag', 'user', 'text'],
    encoding='latin-1',
)

In [3]:
# Derive a numeric sentiment label from the raw target code:
# 0 -> -1, 4 -> 1, and any other value -> 0.
df['sentiment'] = df['target'].apply(
    lambda code: 1 if code == 4 else (-1 if code == 0 else 0)
)

# Keep a reproducible random 10% of the rows (the other 90% are discarded).
# NOTE(review): `df` is rebound to its own sample, so re-running this cell
# without reloading the data shrinks the frame again.
df = df.sample(random_state=42, frac=0.1)

# Hold out 20% of the sampled rows for testing; the rest is used for training.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Pull the tweet text and the sentiment label out of each split as arrays
# (nothing is dropped from train_df / test_df themselves).
train_texts, train_sentiments = train_df['text'].values, train_df['sentiment'].values
test_texts, test_sentiments = test_df['text'].values, test_df['sentiment'].values

In [4]:
# Fetch the NLTK packages used in this cell: the stop-word corpus and 'punkt'.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# English stop words, held in a set so membership checks are cheap.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Return ``text`` with stop words removed.

    The text is split with ``word_tokenize``; every token whose lowercase
    form is in the module-level ``stop_words`` set is dropped, and the
    remaining tokens (original casing kept) are joined with single spaces.
    """
    kept_tokens = (
        token
        for token in word_tokenize(text)
        if token.lower() not in stop_words
    )
    return ' '.join(kept_tokens)

# Strip stop words from every tweet in both splits.
train_texts = list(map(preprocess_text, train_texts))
test_texts = list(map(preprocess_text, test_texts))

# Turn the cleaned text into TF-IDF feature matrices. The vectorizer is
# fitted on the training texts only (capped at 5000 features) and then
# applied, already fitted, to the test texts.
vectorizer = TfidfVectorizer(max_features=5000)
train_features = vectorizer.fit_transform(train_texts)
test_features = vectorizer.transform(test_texts)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tejas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tejas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Fit a linear-kernel Support Vector Machine on the TF-IDF training features.
svm_classifier = SVC(random_state=42, kernel='linear')
svm_classifier.fit(train_features, train_sentiments)

# Label the held-out test set with the fitted classifier.
predictions = svm_classifier.predict(test_features)

# Score the predictions against the true test labels.
accuracy = accuracy_score(y_true=test_sentiments, y_pred=predictions)
classification_rep = classification_report(
    y_true=test_sentiments,
    y_pred=predictions,
)

# Report overall accuracy followed by the per-class breakdown.
print(f'Accuracy: {accuracy:.2f}')
print(f'Classification Report:\n{classification_rep}')

Accuracy: 0.77
Classification Report:
              precision    recall  f1-score   support

          -1       0.78      0.74      0.76     15878
           1       0.75      0.79      0.77     16122

    accuracy                           0.77     32000
   macro avg       0.77      0.77      0.77     32000
weighted avg       0.77      0.77      0.77     32000

