In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK resources used later in the script: the Punkt tokenizer data
# for word_tokenize and the English stop-word list.
nltk.download('punkt')
# Recent NLTK releases (3.9 and later) load the tokenizer from the separate
# 'punkt_tab' package; without it word_tokenize raises LookupError there.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Read the training CSV straight from GitHub into a DataFrame.
url = "https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/train.csv"
df = pd.read_csv(url)

# Text clean-up: keep only the label and tweet columns, lower-case every
# tweet, then drop stop words and any token that is not purely alphabetic.
df = df[['label', 'tweet']]
stop_words = set(stopwords.words('english'))


def _clean_tweet(text):
    """Return *text* re-joined from its alphabetic, non-stop-word tokens."""
    kept_tokens = [
        token
        for token in word_tokenize(text)
        if token.isalpha() and token not in stop_words
    ]
    return ' '.join(kept_tokens)


df['tweet'] = df['tweet'].str.lower().apply(_clean_tweet)

# Replace the label column with the integer class ids produced by LabelEncoder.
le = LabelEncoder()
encoded_labels = le.fit_transform(df['label'])
df['label'] = encoded_labels

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['tweet'],
    df['label'],
    random_state=42,
    test_size=0.2,
)

# Bag-of-words counts: the vocabulary is learned from the training tweets only
# and then reused unchanged to encode the test tweets.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

def _score_predictions(y_true, y_pred):
    """Return accuracy plus weighted precision, recall and F1 as a dict.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by a fitted classifier.

    Returns:
        A dict with the keys 'Accuracy', 'Precision', 'Recall' and 'F1',
        in that order. Precision, recall and F1 use ``average='weighted'``.
    """
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, average='weighted'),
        'Recall': recall_score(y_true, y_pred, average='weighted'),
        'F1': f1_score(y_true, y_pred, average='weighted'),
    }


# Train Naive Bayes model
nb = MultinomialNB()
nb.fit(X_train_vec, y_train)
y_pred_nb = nb.predict(X_test_vec)

# Evaluate Naive Bayes; the individual names are kept because the reporting
# code further down prints them one by one.
nb_scores = _score_predictions(y_test, y_pred_nb)
accuracy_nb = nb_scores['Accuracy']
precision_nb = nb_scores['Precision']
recall_nb = nb_scores['Recall']
f1_nb = nb_scores['F1']

# Train and evaluate other models
models = {
    'SVM': SVC(),
    # Seeded so the forest's scores are repeatable between runs, matching the
    # fixed seed already used for the train/test split.
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=200),
}

# Fit each model on the training matrix and score it on the held-out matrix
# with the same metrics used for Naive Bayes.
results = {}
for name, model in models.items():
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)
    results[name] = _score_predictions(y_test, y_pred)

# Report the Naive Bayes metrics on one line, followed by one line per
# additional model showing its full score dictionary.
nb_summary = f"Accuracy: {accuracy_nb}, Precision: {precision_nb}, Recall: {recall_nb}, F1: {f1_nb}\n"
print("Naive Bayes Performance:")
print(nb_summary)
print("Other Models Performance:")
for model in results:
    scores = results[model]
    print(f"{model}: {scores}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Naive Bayes Performance:
Accuracy: 0.9563585171281088, Precision: 0.9526290290027324, Recall: 0.9563585171281088, F1: 0.9507765656882341

Other Models Performance:
SVM: {'Accuracy': 0.9541686219302362, 'Precision': 0.9526433888003272, 'Recall': 0.9541686219302362, 'F1': 0.9454740972984487}
Random Forest: {'Accuracy': 0.9579227279837322, 'Precision': 0.9550014617540478, 'Recall': 0.9579227279837322, 'F1': 0.9522526062728772}
Logistic Regression: {'Accuracy': 0.9585484123259815, 'Precision': 0.955340681299677, 'Recall': 0.9585484123259815, 'F1': 0.95359227008542}
