In [92]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


In [93]:
# reading all csv files.
df = pd.concat(
    map(
        pd.read_csv,
        [
            '../datasets\\1\\xaa.csv',
            '../datasets\\1\\xab.csv',
            '../datasets\\1\\xac.csv',
            '../datasets\\1\\xad.csv',
            '../datasets\\1\\xae.csv',
            '../datasets\\1\\xad.csv',
            '../datasets\\1\\xag.csv',
            '../datasets\\1\\xah.csv'
        ],
    ),
    ignore_index = True
)
df

Unnamed: 0,Text,Language,Label
0,@Charlie_Corley @Kristine1G @amyklobuchar @Sty...,en,litigious
1,#BadBunny: Como dos gotas de agua: Joven se di...,es,negative
2,https://t.co/YJNiO0p1JV Flagstar Bank disclose...,en,litigious
3,Rwanda is set to host the headquarters of Unit...,en,positive
4,OOPS. I typed her name incorrectly (today’s br...,en,litigious
...,...,...,...
938357,@Juice_Lemons in the dark. it’s so good,en,positive
938358,8.SSR &amp; Disha Salian case should be solved...,en,negative
938359,*ACCIDENT: Damage Only* - Raleigh Fire Depart...,en,negative
938360,@reblavoie So happy for her! She’s been incred...,en,positive


In [94]:
# checking for null values
null_items = df[df.isnull().any(axis=1)]
null_indices = null_items.index.tolist()
print(f" null indices : {null_indices}")
null_items

 null indices : [56424, 58817, 100497, 119404, 172545, 309831, 360746, 380989, 468254, 511427, 518486, 530135, 545587, 575863, 627999, 715264, 759867, 794002, 823822, 865082, 876122, 889656, 898130, 915026]


Unnamed: 0,Text,Language,Label
56424,“If you don’t build your dream.Someone will hi...,,positive
58817,Open Digital Platforms to Spur Innovation. via...,,positive
100497,https://twitter.com/LyricsAjj/status/154189078...,,negative
119404,* Everyone danced with Sweet! They're totally ...,,negative
172545,Crews are responding to a motor vehicle accide...,,negative
309831,Crews are responding to a motor vehicle accide...,,negative
360746,https://twitter.com/Tian_A1/status/15419122211...,,positive
380989,https://twitter.com/Silas_Breach/status/154188...,,litigious
468254,Crews are responding to a motor vehicle accide...,,negative
511427,https://twitter.com/ademo68038207/status/15419...,,negative


In [95]:
# removing rows having null values
df = df.drop(null_indices)

In [96]:
#total rows
print("Number of rows:", len(df))

#only en language
df = df[df['Language'] == 'en']

#remove duplicates
df.drop_duplicates(inplace=True)

#number of rows after filtering
print("Number of rows after filtering:", len(df))

#print the preprocessed df
print("\nPreprocessed data:")
print(df.head())

Number of rows: 938338
Number of rows after filtering: 753698

Preprocessed data:
                                                Text Language      Label
0  @Charlie_Corley @Kristine1G @amyklobuchar @Sty...       en  litigious
2  https://t.co/YJNiO0p1JV Flagstar Bank disclose...       en  litigious
3  Rwanda is set to host the headquarters of Unit...       en   positive
4  OOPS. I typed her name incorrectly (today’s br...       en  litigious
5  It sucks for me since I'm focused on the natur...       en   negative


In [97]:
#keep only alphabets
df['Text'] = df['Text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))
df['Text'] = df['Text'].apply(word_tokenize)

print("\nTokenized data:")
print(df.head())


Tokenized data:
                                                Text Language      Label
0  [CharlieCorley, KristineG, amyklobuchar, Style...       en  litigious
2  [httpstcoYJNiOpJV, Flagstar, Bank, discloses, ...       en  litigious
3  [Rwanda, is, set, to, host, the, headquarters,...       en   positive
4  [OOPS, I, typed, her, name, incorrectly, today...       en  litigious
5  [It, sucks, for, me, since, Im, focused, on, t...       en   negative


In [98]:
# Vectorization
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform([' '.join(tokens) for tokens in df['Text']])

#test and train data splitting
X_train, X_test, y_train, y_test = train_test_split(X_bow, df['Label'], test_size=0.2, random_state=42)

#training Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)

#making predictions
y_pred = clf.predict(X_test)

#model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8887289372429349


In [99]:
#print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

   litigious       0.90      0.89      0.89     30988
    negative       0.84      0.92      0.88     42068
    positive       0.91      0.87      0.89     43053
 uncertainty       0.92      0.87      0.90     34631

    accuracy                           0.89    150740
   macro avg       0.89      0.89      0.89    150740
weighted avg       0.89      0.89      0.89    150740



In [100]:
user_input = "I am happy"

#preprocess the user input
user_input_tokens = word_tokenize(re.sub(r'[^a-zA-Z\s]', '', user_input.lower()))

# Vectorization
user_input_bow = vectorizer.transform([' '.join(user_input_tokens)])

#making prediction
user_input_pred = clf.predict(user_input_bow)

#print result
print("Predicted sentiment:", user_input_pred[0])

Predicted sentiment: positive
