In [22]:
import pandas as pd

csv_file_path = "C:\\Users\\SATWIK M BADIGER\\Desktop\\projects\\ML\\Sentimeter\\dataset.csv"

df = pd.read_csv(csv_file_path)

print(df)

                                                     Text Language      Label
0       @Charlie_Corley @Kristine1G @amyklobuchar @Sty...       en  litigious
1       #BadBunny: Como dos gotas de agua: Joven se di...       es   negative
2       https://t.co/YJNiO0p1JV Flagstar Bank disclose...       en  litigious
3       Rwanda is set to host the headquarters of Unit...       en   positive
4       OOPS. I typed her name incorrectly (today’s br...       en  litigious
...                                                   ...      ...        ...
937849            @Juice_Lemons in the dark. it’s so good       en   positive
937850  8.SSR &amp; Disha Salian case should be solved...       en   negative
937851  *ACCIDENT:  Damage Only* - Raleigh Fire Depart...       en   negative
937852  @reblavoie So happy for her! She’s been incred...       en   positive
937853                         I'm lost and I'm found but       en   negative

[937854 rows x 3 columns]


In [23]:
#missing values
print("\nMissing values:")
print(df.isnull().sum())


Missing values:
Text         0
Language    23
Label        0
dtype: int64


In [24]:
print("Preprocessed data:")
print(df.head())

Preprocessed data:
                                                Text Language      Label
0  @Charlie_Corley @Kristine1G @amyklobuchar @Sty...       en  litigious
1  #BadBunny: Como dos gotas de agua: Joven se di...       es   negative
2  https://t.co/YJNiO0p1JV Flagstar Bank disclose...       en  litigious
3  Rwanda is set to host the headquarters of Unit...       en   positive
4  OOPS. I typed her name incorrectly (today’s br...       en  litigious


In [25]:
#total rows
print("Number of rows:", len(df))

#only en language
df = df[df['Language'] == 'en']

#remove duplicates
df.drop_duplicates(inplace=True)

#number of rows after filtering
print("Number of rows after filtering:", len(df))

#print the preprocessed df
print("\nPreprocessed data:")
print(df.head())

Number of rows: 937854
Number of rows after filtering: 867178

Preprocessed data:
                                                Text Language      Label
0  @Charlie_Corley @Kristine1G @amyklobuchar @Sty...       en  litigious
2  https://t.co/YJNiO0p1JV Flagstar Bank disclose...       en  litigious
3  Rwanda is set to host the headquarters of Unit...       en   positive
4  OOPS. I typed her name incorrectly (today’s br...       en  litigious
5  It sucks for me since I'm focused on the natur...       en   negative


In [26]:
import re

#keep only alphabets
df['Text'] = df['Text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

from nltk.tokenize import word_tokenize

df['Text'] = df['Text'].apply(word_tokenize)

print("\nTokenized data:")
print(df.head())


Tokenized data:
                                                Text Language      Label
0  [CharlieCorley, KristineG, amyklobuchar, Style...       en  litigious
2  [httpstcoYJNiOpJV, Flagstar, Bank, discloses, ...       en  litigious
3  [Rwanda, is, set, to, host, the, headquarters,...       en   positive
4  [OOPS, I, typed, her, name, incorrectly, today...       en  litigious
5  [It, sucks, for, me, since, Im, focused, on, t...       en   negative


In [27]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Vectorization
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform([' '.join(tokens) for tokens in df['Text']])

#test and train data splitting
X_train, X_test, y_train, y_test = train_test_split(X_bow, df['Label'], test_size=0.2, random_state=42)

#training Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)

#making predictions
y_pred = clf.predict(X_test)

#model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8874743421204364


In [28]:
from sklearn.metrics import classification_report

#print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

   litigious       0.90      0.89      0.89     35713
    negative       0.84      0.92      0.88     48654
    positive       0.91      0.87      0.89     49389
 uncertainty       0.92      0.87      0.90     39680

    accuracy                           0.89    173436
   macro avg       0.89      0.89      0.89    173436
weighted avg       0.89      0.89      0.89    173436



In [29]:
user_input = "I am happy"

#preprocess the user input
user_input_tokens = word_tokenize(re.sub(r'[^a-zA-Z\s]', '', user_input.lower()))

# Vectorization
user_input_bow = vectorizer.transform([' '.join(user_input_tokens)])

#making prediction
user_input_pred = clf.predict(user_input_bow)

#print result
print("Predicted sentiment:", user_input_pred[0])

Predicted sentiment: positive
