### 1. Importing the Libraries

In [9]:
import pandas as pd
import re
import joblib
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

### 2. Importing Dataset

In [10]:
df = pd.read_csv('twitter_training.csv',header=None)
df = df.rename({0 : 'id', 1: 'company', 2:'sentiment',3:'raw_tweet'},axis=1)

df = df[df['sentiment'] != 'Irrelevant']
df = df[df['sentiment'] != 'Neutral']

tweets = df['raw_tweet'].values

In [12]:
df.isnull().sum()

id             0
company        0
sentiment      0
raw_tweet    361
dtype: int64

### 3. Text preprocessing

In [27]:
processed_tweets = []

for tweet in tqdm(tweets):
  new_tweet = []

  # Check if tweet is a string or a list of tokens (e.g., from previous processing)
  if isinstance(tweet, str):
    words = tweet.split()
  elif isinstance(tweet, list):
    words = tweet

  # Iterate through words (including tokens if already split)
  for word in words:
    if isinstance(word, float):
      # Convert float to string with controlled formatting (optional)
      formatted_word = str(word).format("%.2f")  # Two decimal places by default
      # Add symbols if necessary: formatted_word = "$" + formatted_word or "%" + formatted_word + "%"

      # Add the formatted string to the new tweet
      new_tweet.append(formatted_word)
    else:
      # Keep non-float words as they are
      new_tweet.append(word)

  # Join words into a single processed tweet
  processed_tweets.append(" ".join(new_tweet))

print(processed_tweets)



100%|█████████████████████████████████| 43374/43374 [00:00<00:00, 255144.28it/s]
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### TF-IDF

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
# Encoding tweets
enc_tweets = tfidf.fit_transform(processed_tweets)
df_ = pd.DataFrame(enc_tweets.toarray(),columns=tfidf.get_feature_names_out())
df_

Unnamed: 0,00,000,00011,00014,00015,00016,00054,00105,00107,00303,...,фору,это,юууу,яй,ясс,اللعبه,حبيت,خلاص,٥υ,ℐℓ٥
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43371,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 5. Saving the model and encoding

In [29]:
joblib.dump(tfidf,'tfidf_model.joblib')
joblib.load('tfidf_model.joblib')

### 6. Preparing Features and Labels

In [30]:
x_train = enc_tweets.toarray()
y_train = pd.get_dummies(df['sentiment']).values[:,1:]

In [31]:
x_train.shape

(43374, 19337)

In [32]:
y_train.shape

(43374, 1)

### 7. Model Training

In [33]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


### 7. Preparing Testing Data

In [35]:
df_test = pd.read_csv('twitter_validation.csv',header=None)
df_test = df.rename({0 : 'id', 1: 'company', 2:'sentiment',3:'raw_tweet'},axis=1)

df_test = df_test[df_test['sentiment'] != 'Irrelevant']
df_test = df_test[df_test['sentiment'] != 'Neutral']

test_tweets = df_test['raw_tweet'].values