Task 04  
Analyze and visualize sentiment patterns in social media data to understand public opinion and attitudes towards specific topics or brands.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords

Load the Dataset

In [2]:
df_train = pd.read_csv('twitter_training.csv', header = None)

In [3]:
df_train.head()

Unnamed: 0,0,1,2,3
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [5]:
df_train.columns = ['ID', 'Category', 'Sentiment', 'Text']
df_train.columns

Index(['ID', 'Category', 'Sentiment', 'Text'], dtype='object')

Load the Dataset

In [6]:
df_test = pd.read_csv('twitter_validation.csv', header = None)

In [7]:
df_test.head()

Unnamed: 0,0,1,2,3
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [13]:
df_test.columns = ['ID', 'Category', 'Sentiment', 'Text']
df_test.columns

Index(['ID', 'Category', 'Sentiment', 'Text'], dtype='object')

In [14]:
df_train.isnull().sum()

ID             0
Category       0
Sentiment      0
Text         686
dtype: int64

In [11]:
df_test.isnull().sum()


ID           0
Category     0
Sentiment    0
Text         0
dtype: int64

In [15]:
df_train = df_train.dropna()
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sonali\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [16]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

def remove_symbols(text):
    pattern = r'[^A-Za-z\s]'
    text = re.sub(pattern, '', text)
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    return text

# Clean text column
df_train['Text'] = df_train['Text'].apply(lambda x: remove_html_tags(x))
df_train['Text'] = df_train['Text'].apply(lambda x: remove_symbols(x))

df_test['Text'] = df_test['Text'].apply(lambda x: remove_html_tags(x))
df_test['Text'] = df_test['Text'].apply(lambda x: remove_symbols(x))

  soup = BeautifulSoup(text, 'html.parser')
  soup = BeautifulSoup(text, 'html.parser')


In [17]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

X_train = df_train['Text']
y_train = df_train.Sentiment

X_test = df_test['Text']
y_test = df_test.Sentiment
tfidf_vectorizer = TfidfVectorizer(stop_words = 'english', max_df = 0.7)

tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)
pac = PassiveAggressiveClassifier(max_iter = 50)
pac.fit(tfidf_train, y_train)

In [18]:
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)

In [19]:
print(f'Acc: {round(score*100, 2)}%')

Acc: 95.2%


In [20]:
confusion_matrix(y_test, y_pred, labels = ['Neutral', 'Positive', 'Negative'])

array([[271,   8,   3],
       [  4, 262,   7],
       [  3,   5, 258]], dtype=int64)