# Text Sentiment Analysis Using Naive Bayes

## 1. Importing Libraries

In [1]:
import re
import joblib
import pickle
import pandas as pd
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

# Single Porter stemmer instance, created once and reused by the
# preprocessing functions defined later in the notebook.
stemmer = PorterStemmer()

## 2. Defining Functions

In [2]:
def loading_tweets(file_name):
    """Load a header-less tweet CSV and return its tweet texts and cleaned frame.

    The first four positional columns are renamed to ``id``, ``company``,
    ``sentiment`` and ``raw_tweet``; any row containing a missing value is
    dropped. No sentiment class is filtered out here.

    Parameters
    ----------
    file_name : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(raw_tweets, frame)`` where ``raw_tweets`` is the ``raw_tweet``
        column as a NumPy array and ``frame`` is the cleaned DataFrame.
    """
    column_names = {0: 'id', 1: 'company', 2: 'sentiment', 3: 'raw_tweet'}

    frame = (
        pd.read_csv(file_name, header=None)
        .rename(columns=column_names)
        .dropna()
    )

    return frame['raw_tweet'].values, frame

def preprocessing_tweets(tweets):
    """Clean, stop-word-filter and stem a collection of tweets.

    For each tweet: every non-letter character is replaced by a space, the
    text is lower-cased, English stop words are removed, the remaining words
    are Porter-stemmed, and the result is re-joined with single spaces.

    Parameters
    ----------
    tweets : iterable of str
        Raw tweet texts.

    Returns
    -------
    list of str
        One processed string per input tweet, in the same order.
    """
    # Load the stop-word list ONCE and keep it as a set. Calling
    # stopwords.words('english') inside the comprehension re-reads the corpus
    # and does a linear list scan for every single word of every tweet.
    stop_words = set(stopwords.words('english'))

    processed_tweets = []
    for tweet in tqdm(tweets):
        tweet = re.sub(r'[^a-zA-Z]', ' ', tweet).lower()
        # After the substitution only letters and spaces remain, so a bare
        # split() yields the same words as split(' ') minus the empty tokens.
        words = [stemmer.stem(word) for word in tweet.split() if word not in stop_words]
        # Keep the original guard against empty tokens after stemming.
        processed_tweets.append(' '.join(word for word in words if len(word) != 0))

    return processed_tweets
        

## 3. Preparing Training Data

In [3]:
# Load and preprocess the training tweets.
tweets, df = loading_tweets('D:\\GFG Data Science and Machine Learning\\Machine Learning\\ML Datasets\\twitter_training.csv')
processed_tweets = preprocessing_tweets(tweets)

# Fit the TF-IDF vocabulary on the training corpus only.
tfidf = TfidfVectorizer()
enc_tweets = tfidf.fit_transform(processed_tweets)

# Keep the features sparse: MultinomialNB accepts sparse matrices directly,
# and densifying a (n_tweets x vocabulary) matrix costs a lot of memory.
x_train = enc_tweets

# Binary target taken from the LAST one-hot column of `sentiment`, as a 1-D
# vector (indexing with -1 instead of -1: avoids the (n, 1) column vector
# that makes scikit-learn emit a DataConversionWarning).
# | 1 -> last sentiment class | 0 -> every other sentiment class |
# NOTE(review): get_dummies orders columns by label, so the last class is
# presumably 'Positive'; since no class is filtered out in loading_tweets,
# 0 is "not positive" rather than strictly "negative" -- confirm intent.
y_train = pd.get_dummies(df['sentiment']).astype(int).values[:, -1]

100%|███████████████████████████████████████████████████████████████████████████| 73996/73996 [11:59<00:00, 102.91it/s]


## 4. Preparing Testing Data

In [13]:
# Load and preprocess the validation tweets with the same pipeline as training.
test_tweets, df_test = loading_tweets('D:\\GFG Data Science and Machine Learning\\Machine Learning\\ML Datasets\\twitter_validation.csv')
test_processed_tweets = preprocessing_tweets(test_tweets)

# Re-use the vectorizer fitted on the training data (transform only, no refit).
test_enc_tweets = tfidf.transform(test_processed_tweets)

# Keep the features sparse: the classifier's predict() accepts sparse input,
# so there is no need to materialise a dense array.
x_test = test_enc_tweets

# Binary target taken from the LAST one-hot column of `sentiment`, as a 1-D
# vector rather than an (n, 1) column.
# | 1 -> last sentiment class | 0 -> every other sentiment class |
# NOTE(review): presumably the last class is 'Positive', making 0 mean
# "not positive" rather than strictly "negative" -- confirm intent.
y_test = pd.get_dummies(df_test['sentiment']).astype(int).values[:, -1]


100%|██████████████████████████████████████████████████████████████████████████████| 1001/1001 [00:11<00:00, 88.10it/s]


## 5. Model Training

In [14]:
# Multinomial Naive Bayes on the TF-IDF features.
model = MultinomialNB()
# fit() expects a 1-D target; ravel() flattens an (n, 1) column vector (and is
# a no-op on an already 1-D array), which avoids the DataConversionWarning.
model.fit(x_train, y_train.ravel())

  y = column_or_1d(y, warn=True)


## 6. Model Evaluation

In [15]:
# accuracy_score is documented as accuracy_score(y_true, y_pred): pass the
# ground truth first. Accuracy itself is symmetric, so the printed values are
# unchanged, but the conventional order keeps this safe to copy for
# asymmetric metrics (precision, recall, ...).
train_accuracy = accuracy_score(y_train, model.predict(x_train))
test_accuracy = accuracy_score(y_test, model.predict(x_test))

print('Training Accuracy :', round(train_accuracy*100,2))
print('Testing Accuracy  :', round(test_accuracy*100,2))

Training Accuracy : 85.46
Testing Accuracy  : 82.82


## 7. Saving the Model And Encoder

In [16]:
# Persist the fitted TF-IDF encoder.
joblib.dump(tfidf, 'tfidf_model2.joblib')      ## TF-IDF Encoder

# Persist the trained classifier. The context manager guarantees the file is
# flushed and closed even if pickling raises.
with open("model2.mdl", "wb") as model_file:
    pickle.dump(model, model_file)     ## Model

print('Encoder is saved with name tfidf_model2.joblib')
print('Model saved with name model2.mdl')

Encoder is saved with name tfidf_model2.joblib
Model saved with name model2.mdl


## 8. Testing for Single Tweets

In [19]:
def preprocess_single_tweet(tweet):
    """Apply the training-time text cleaning to one tweet.

    Non-letter characters are replaced by spaces, the text is lower-cased,
    English stop words are dropped, the remaining words are Porter-stemmed and
    re-joined with single spaces.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The processed tweet.
    """
    # Build the stop-word set once per call instead of reloading the list for
    # every word; set membership is also O(1) versus a linear list scan.
    stop_words = set(stopwords.words('english'))

    tweet = re.sub(r'[^a-zA-Z]', ' ', tweet).lower()
    # Only letters and spaces remain, so split() drops exactly the empty
    # tokens that the `if word` check used to filter out.
    words = [stemmer.stem(word) for word in tweet.split() if word not in stop_words]
    return ' '.join(words)

In [20]:
# Reload the classifier saved earlier in this notebook. The context manager
# closes the file handle instead of leaking it.
# NOTE: pickle.load executes arbitrary code from the file -- only load
# artefacts you produced yourself or otherwise trust.
with open("model2.mdl", 'rb') as model_file:
    mdl = pickle.load(model_file)

# Reload the fitted TF-IDF encoder.
jbl = joblib.load('tfidf_model2.joblib')

In [21]:
# Example tweet to classify.
text = "The customer service was terrible. I waited over an hour, and they still couldn’t resolve my issue. Definitely not coming back!"

# Clean it with the same steps used for the training data.
res_text = preprocess_single_tweet(text)

# The encoder expects an iterable of documents, so wrap the tweet in a
# one-element batch before vectorising, then densify the single row.
single_tweet_batch = [res_text]
res_enc_tweets = jbl.transform(single_tweet_batch)
res_test = res_enc_tweets.toarray()

In [22]:
# Predict on the one-row batch and take element 0, i.e. the label for the
# single example tweet.
mdl.predict(res_test)[0]

np.int64(0)

In [23]:
# Display the dense TF-IDF vector that was passed to the model above.
res_test

array([[0., 0., 0., ..., 0., 0., 0.]])