Twitter Sentiment Analysis with Passive Aggressive Classifier
The Passive Aggressive Classifier is like a stubborn but adaptable learner in the world of machine learning. It is designed to adapt quickly to new data streams while resisting changes that are not significant. Imagine it as a student who learns from new examples but doesn't change their understanding drastically unless absolutely necessary. When faced with a new data point, it stays passive — leaving the model unchanged — if the point agrees with what it already knows, and turns aggressive — updating the model just enough to correct the mistake — if the new information contradicts its current understanding. This flexibility makes it particularly useful for tasks like text classification or online learning scenarios where data arrives sequentially.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords

In [None]:
# Load the training split. header=None tells pandas the file has no header
# row, so the first line is read as data and columns get integer labels.
df_train = pd.read_csv(
    '/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv',
    header=None,
)

In [None]:
# Preview the first rows of the training frame (columns are still unnamed here).
df_train.head()

In [None]:
# Replace the default integer column labels with descriptive names.
# 'Text' and 'Sentiment' are later used as the feature and the target.
df_train.columns = ['ID', 'Category', 'Sentiment', 'Text']

In [None]:
# Display the column index to confirm the renaming took effect.
df_train.columns

In [None]:
# Load the validation split, used below as the held-out test set.
# header=None: the file has no header row, so the first line is data.
df_test = pd.read_csv(
    '/kaggle/input/twitter-entity-sentiment-analysis/twitter_validation.csv',
    header=None,
)

In [None]:
# Preview the first rows of the test frame (columns are still unnamed here).
df_test.head()

In [None]:
# Give the test frame the same column names as the training frame so the
# same 'Text' / 'Sentiment' selections work on both.
df_test.columns = ['ID', 'Category', 'Sentiment', 'Text']

In [None]:
# Display the column index to confirm the renaming took effect.
df_test.columns

In [None]:
# Count missing values per column in the training frame.
df_train.isnull().sum()

In [None]:
# Same per-column missing-value count as the previous cell; df_train has not
# been modified in between, so the output is identical.
df_train.isnull().sum()

In [None]:
# Count missing values per column in the test frame.
df_test.isnull().sum()

In [None]:
# Drop every training row that has a missing value in any column.
# Only the training frame is filtered; df_test is left as loaded.
df_train = df_train.dropna()

In [None]:
# Make the NLTK stop-word corpus available, then keep the English word list
# as a set so the membership test in remove_symbols() is a hash lookup.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

In [None]:
def remove_html_tags(text):
    """Return the text content of *text* with any HTML markup removed.

    The input is parsed with BeautifulSoup using the ``html.parser`` backend,
    and the text extracted from the parsed document is returned.
    """
    return BeautifulSoup(text, 'html.parser').get_text()

def remove_symbols(text):
    """Strip non-letter characters and stop words from *text*.

    Every character that is neither an ASCII letter nor whitespace is deleted
    (not replaced by a space, so ``don't`` becomes ``dont``). The remainder is
    split on whitespace, tokens whose lowercase form is in the module-level
    ``stop_words`` set are discarded, and the surviving tokens are joined with
    single spaces. Original letter case is preserved in the output.
    """
    letters_only = re.sub(r'[^A-Za-z\s]', '', text)
    kept_tokens = (
        token
        for token in letters_only.split()
        if token.lower() not in stop_words
    )
    return ' '.join(kept_tokens)

# Clean text column
# Both frames get the same two-step treatment, in the same order:
# HTML is stripped first, then symbols and stop words are removed.
for frame in (df_train, df_test):
    frame['Text'] = (
        frame['Text']
        .apply(remove_html_tags)
        .apply(remove_symbols)
    )

In [None]:
# Features are the cleaned tweet text; targets are the sentiment labels.
X_train, y_train = df_train['Text'], df_train['Sentiment']
X_test, y_test = df_test['Text'], df_test['Sentiment']

In [None]:
# TF-IDF features over the cleaned text. scikit-learn's built-in English
# stop-word list is applied, and max_df=0.7 ignores terms that occur in more
# than 70% of the documents.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.7,
    stop_words='english',
)

# The vocabulary and IDF weights are learned from the training text only;
# the test text is merely transformed with that fitted vectorizer.
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [None]:
# Train the Passive Aggressive classifier on the TF-IDF training matrix.
# The estimator shuffles the training data after each epoch by default, so
# without a fixed seed the fitted weights (and the accuracy reported below)
# vary from run to run. random_state pins that shuffling so the notebook is
# reproducible.
pac = PassiveAggressiveClassifier(max_iter = 50, random_state = 42)
pac.fit(tfidf_train, y_train)

In [None]:
# Predict a sentiment label for every test document, then compute the
# fraction of predictions that match the true label.
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Report the accuracy as a percentage rounded to two decimal places.
print(f'Acc: {round(score*100, 2)}%')

In [None]:
# confusion_matrix() only tabulates the labels it is given: samples whose true
# or predicted class is not in `labels` are silently left out. Keep the
# original three labels first (same row/column order as before) and append
# any further class the classifier learned during fit, so no class is dropped.
reported_labels = ['Neutral', 'Positive', 'Negative']
reported_labels += [label for label in pac.classes_ if label not in reported_labels]
confusion_matrix(y_test, y_pred, labels = reported_labels)