In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import re

# Fetch the NLTK resources this notebook relies on: the Punkt sentence/word
# tokenizer models (used by word_tokenize) and the stop-word lists.
nltk.download('punkt')
# NLTK >= 3.8.2 loads the tokenizer from the pickle-free 'punkt_tab' package
# instead of 'punkt'; without it word_tokenize raises LookupError on a fresh
# environment. Downloading both keeps the notebook runnable on old and new NLTK.
nltk.download('punkt_tab')
# Kept last so the cell still echoes the stop-word download status.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
_ENGLISH_STOP_WORDS = None  # filled lazily by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_tweet(text):
    """Normalize a raw tweet into a space-separated string of content tokens.

    Steps, in order:
      1. strip URL-like runs (anything starting with ``http`` or ``www``),
      2. strip ``@mentions`` entirely and drop ``#`` characters (the hashtag
         word itself is kept),
      3. lowercase,
      4. tokenize with ``word_tokenize``,
      5. drop tokens found in NLTK's English stop-word list.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. the float NaN pandas
        produces for an empty CSV cell) is treated as an empty tweet.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``''`` for non-string
        input.
    """
    if not isinstance(text, str):
        # re.sub would raise TypeError on NaN/None; an empty document is the
        # safest stand-in for a missing tweet.
        return ''

    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = text.lower()
    text_tokens = word_tokenize(text)

    # The stop-word list is loop-invariant: fetch it once (cached across calls)
    # and use set membership instead of re-reading a list for every token.
    stop_words = _english_stop_words()
    filtered_words = [word for word in text_tokens if word not in stop_words]
    return ' '.join(filtered_words)

In [4]:
def load_data(file_path):
    """Load a tweet CSV and clean its ``text`` column.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; it must contain a ``text`` column.

    Returns
    -------
    pandas.DataFrame
        The parsed frame, with every ``text`` entry replaced by the result of
        ``preprocess_tweet``. Other columns are returned as read.
    """
    frame = pd.read_csv(file_path)
    cleaned_text = frame['text'].apply(preprocess_tweet)
    frame['text'] = cleaned_text
    return frame

In [9]:
# Read the tweet CSV (a path relative to the current working directory) and
# clean its 'text' column via load_data; `data` feeds the train/test split.
file_path = 'dataset.csv'
data = load_data(file_path)

In [10]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
# NOTE(review): the target column is named 'sentiment', but the report output
# lists account handles as classes -- confirm the column name is intended.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Token counts -> TF-IDF weighting -> multinomial Naive Bayes classifier.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

In [21]:
# Fit the whole pipeline on the training split, then label the held-out tweets.
text_clf.fit(X_train, y_train)
predictions = text_clf.predict(X_test)

# Some classes are never predicted, so their precision is undefined. Passing
# zero_division=0 reports those scores as 0.0 (the same value the default
# produces) without emitting an UndefinedMetricWarning for each of them.
print(classification_report(y_test, predictions, zero_division=0))

                 precision    recall  f1-score   support

            AOC       0.61      0.79      0.68       727
    BarackObama       0.00      0.00      0.00       148
      Cristiano       1.00      0.11      0.20       100
  GretaThunberg       1.00      0.54      0.70       337
HIDEO_KOJIMA_EN       0.88      0.79      0.83       732
         Malala       0.00      0.00      0.00        46
          Oprah       0.00      0.00      0.00       102
   TheEllenShow       0.63      0.80      0.70       994
       elonmusk       1.00      0.20      0.33       359
        garyvee       0.52      0.83      0.64      1311
     jk_rowling       0.97      0.26      0.41       290
       joerogan       0.94      0.68      0.79       661
jordanbpeterson       0.78      0.72      0.75       926
       ladygaga       1.00      0.11      0.21       157
   narendramodi       0.69      0.99      0.81      1355
      neiltyson       1.00      0.10      0.18       218
 richardbranson       0.85    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Same per-class metrics as the printed report, but as a nested dict so they
# can be formatted programmatically. zero_division=0 keeps undefined scores at
# 0.0 without raising UndefinedMetricWarning for never-predicted classes.
report = classification_report(
    y_test, predictions, output_dict=True, zero_division=0
)
# Confusion matrix of the true test labels against the predicted labels.
cm = confusion_matrix(y_test, predictions)

def latex_output(report, cm):
    """Render a classification report and a confusion matrix as LaTeX source.

    Parameters
    ----------
    report : dict
        Mapping of row label -> metrics mapping with ``"precision"``,
        ``"recall"`` and ``"f1-score"`` entries. The ``'accuracy'`` key is
        skipped. Underscores in labels are replaced by spaces.
    cm : iterable of iterables
        Confusion-matrix rows; each cell is rendered with ``str``.

    Returns
    -------
    tuple of (str, str)
        ``(report_latex, cm_latex)``: a ``tabular`` environment with one line
        per report entry (metrics formatted to two decimals), and a
        ``bmatrix`` environment with one line per matrix row.

    NOTE(review): when compiled with real LaTeX, amsmath matrix environments
    are presumably limited to 10 columns unless ``MaxMatrixCols`` is raised --
    confirm before pasting a large matrix into a document.
    """
    # --- metrics table -----------------------------------------------------
    table_parts = [
        '\\begin{tabular}{lrrr}\n',
        '\\hline\n',
        '\\textbf{Class} & \\textbf{Precision} & \\textbf{Recall} & \\textbf{F1-Score} \\\\\n',
        '\\hline\n',
    ]
    for raw_label, metrics in report.items():
        if raw_label == 'accuracy':
            # The accuracy entry has no per-metric breakdown; leave it out.
            continue
        label = raw_label.replace('_', ' ')  # Replace underscores with spaces
        table_parts.append(
            f'{label} & {metrics["precision"]:.2f} & {metrics["recall"]:.2f} & {metrics["f1-score"]:.2f} \\\\\n'
        )
    table_parts.append('\\hline\n')
    table_parts.append('\\end{tabular}\n')
    report_latex = ''.join(table_parts)

    # --- confusion matrix --------------------------------------------------
    matrix_parts = ['\\begin{bmatrix}\n']
    for cells in cm:
        row_str = ' & '.join(map(str, cells))
        matrix_parts.append(f'{row_str} \\\\\n')
    matrix_parts.append('\\end{bmatrix}')
    cm_latex = ''.join(matrix_parts)

    return report_latex, cm_latex


In [23]:
from IPython.display import display, Math

# Build both LaTeX fragments; only the confusion matrix is rendered in this
# cell, while report_latex stays available for export.
report_latex, cm_latex = latex_output(report, cm)

display(
    Math(f'\\text{{Confusion Matrix}}\\\\\n{cm_latex}')
)


<IPython.core.display.Math object>