### 1. Importing the Libraries

In [9]:
import pandas as pd
import re
import joblib
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

### 2. Importing Dataset

In [65]:
def loading_tweets(file_name):
    df = pd.read_csv('twitter_training.csv',header=None)
    df = df.rename({0 : 'id', 1: 'company', 2:'sentiment',3:'raw_tweet'},axis=1)

    df = df[df['sentiment'] != 'Irrelevant']
    df = df[df['sentiment'] != 'Neutral']
    df = df.dropna()
    tweets = df['raw_tweet'].values

### 3. Text preprocessing

In [66]:
def preprocessing_tweets(tweets):
    processed_tweets = []

    for tweet in tqdm(tweets):
      new_tweet = []

      # Check if tweet is a string or a list of tokens (e.g., from previous processing)
      if isinstance(tweet, str):
        words = tweet.split()
      elif isinstance(tweet, list):
        words = tweet

      # Iterate through words (including tokens if already split)
      for word in words:
        if isinstance(word, float):
          # Convert float to string with controlled formatting (optional)
          formatted_word = str(word).format("%.2f")  # Two decimal places by default
          # Add symbols if necessary: formatted_word = "$" + formatted_word or "%" + formatted_word + "%"

          # Add the formatted string to the new tweet
          new_tweet.append(formatted_word)
        else:
          # Keep non-float words as they are
          new_tweet.append(word)

      # Join words into a single processed tweet
      processed_tweets.append(" ".join(new_tweet))
    return processed_tweets

### 4.  TF-IDF

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
# Encoding tweets
enc_tweets = tfidf.fit_transform(processed_tweets)
df_ = pd.DataFrame(enc_tweets.toarray(),columns=tfidf.get_feature_names_out())
df_

Unnamed: 0,00,000,00011,00014,00015,00016,00054,00105,00107,00303,...,фору,это,юууу,яй,ясс,اللعبه,حبيت,خلاص,٥υ,ℐℓ٥
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43371,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 5. Saving the model and encoding

In [29]:
joblib.dump(tfidf,'tfidf_model.joblib')
joblib.load('tfidf_model.joblib')

### 6. Preparing Features and Labels

In [30]:
x_train = enc_tweets.toarray()
y_train = pd.get_dummies(df['sentiment']).values[:,1:]

In [31]:
x_train.shape

(43374, 19337)

In [32]:
y_train.shape

(43374, 1)

### 7. Model Training

In [33]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


### 7. Preparing Testing Data

In [42]:
df_test = pd.read_csv('twitter_validation.csv',header=None)
df_test = df_test.rename({0 : 'id', 1: 'company', 2:'sentiment',3:'raw_tweet'},axis=1)

df_test = df_test[df_test['sentiment'] != 'Irrelevant']
df_test = df_test[df_test['sentiment'] != 'Neutral']

test_tweets = df_test['raw_tweet'].values

processed_tweets = []

for tweet in tqdm(test_tweets):
    tweet = re.sub(r'[^a-zA-Z]',' ',tweet)
    tweet = tweet.lower()
    tweet = [word for word in tweet.split(' ') if not word in stopwords.words('english')]
    tweet = [stemmer.stem(word) for word in tweet]
    tweet = [word for word in tweet if len(word) !=0]
    tweet = ' '.join(tweet)
    processed_tweets.append(tweet)
    
enc_tweets = tfidf.transform(processed_tweets)
df_ = pd.DataFrame(enc_tweets.toarray(),columns=tfidf.get_feature_names_out())
df_

100%|████████████████████████████████████████| 543/543 [00:01<00:00, 537.24it/s]


Unnamed: 0,00,000,00011,00014,00015,00016,00054,00105,00107,00303,...,фору,это,юууу,яй,ясс,اللعبه,حبيت,خلاص,٥υ,ℐℓ٥
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
540,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
541,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
x_test = enc_tweets.toarray()
y_test = pd.get_dummies(df_test['sentiment'])['Positive'].values

In [50]:
y_pred = model.predict(x_test)

In [51]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.861878453038674

In [57]:
df[df['sentiment'] == 'Negative'].shape

(22542, 4)

In [58]:
df[df['sentiment'] == 'Positive'].shape

(20832, 4)

### Model Evaluation

In [67]:
print('Training Accuracy:',round(accuracy_score(model.predict(x_train),y_train)*100,3))
print('Testing Accuracy:',round(accuracy_score(model.predict(x_test),y_test)*100,3))

Training Accuracy: 91.407
Testing Accuracy: 86.188


### 8. Model Saving

In [60]:
import pickle

# Use a context manager to automatically close the file
with open('model.model', 'wb') as f:
    pickle.dump(model, f)

In [62]:
pickle.load(open('model.model','rb'))

In [64]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,model.predict(x_test))

0.861878453038674