In [1]:
import pandas as pd
import re
from nltk.stem import PorterStemmer
from tqdm import tqdm
import joblib
import pickle
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from tqdm import tqdm

tfidf=TfidfVectorizer()
stemmer=PorterStemmer()

In [2]:
def loading_tweets(file_name):
    df=pd.read_csv(file_name,header=None)
    df=df.rename({0:'id',1:'company',2:'sentiment',3:'raw_tweets'},axis=1)
    df=df[df['sentiment']!='Irrelavent']
    df=df[df['sentiment']!='Neutral']
    df=df.dropna()
    
    return df['raw_tweets'].values,df

In [3]:
def processed_tweets(tweets):
    processed_tweets=[]
    for tweet in tqdm(tweets):
        tweet=re.sub(r'[^a-zA-Z]',' ',tweet)
        tweet=tweet.lower()
        tweet=[word for word in tweet.split(' ') if not word in stopwords.words('english')]
        tweet=[stemmer.stem(word) for word in tweet]
        tweet=[word for word in tweet if len(word)!=0]
        tweet=' '.join(tweet)
        processed_tweets.append(tweet)
    return processed_tweets

### Preparing Training data

In [4]:
train_tweets,df_train=loading_tweets('twitter_training.csv/twitter_training.csv')

In [5]:
train_tweets=processed_tweets(train_tweets)

100%|██████████| 55888/55888 [11:49<00:00, 78.79it/s]   


In [6]:
# Transform test tweets using the pre-fitted TF-IDF vectorizer
train_tweets = tfidf.fit_transform(train_tweets)

In [7]:
# Convert the TF-IDF matrix to a dense array for train data
x_train = train_tweets.toarray()

# Convert sentiment labels to one-hot encoded vectors for training
y_train = pd.get_dummies(df_train['sentiment']).values[:, 1:]

In [8]:
test_tweets,df_test=loading_tweets('twitter_Validation.csv')

In [9]:
test_tweets=processed_tweets(test_tweets)

100%|██████████| 715/715 [00:06<00:00, 106.17it/s]


In [10]:
# Transform test tweets using the pre-fitted TF-IDF vectorizer
test_tweets = tfidf.fit_transform(test_tweets)

In [11]:
# Convert the TF-IDF matrix to a dense array for test data
x_test = test_tweets.toarray()

# Convert sentiment labels to one-hot encoded vectors for testing
y_test = pd.get_dummies(df_test['sentiment']).values[:, 1:]

In [12]:
from sklearn.preprocessing import LabelEncoder

# Assuming df_train is your DataFrame and 'sentiment' is the column to be encoded
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['sentiment'])
y_train = y_train.astype(int)

print(y_train)

[2 2 2 ... 2 2 2]


In [13]:
from sklearn.preprocessing import LabelEncoder
# Assuming df_train is your DataFrame and 'sentiment' is the column to be encoded
label_encoder = LabelEncoder()
y_test = label_encoder.fit_transform(df_test['sentiment'])
y_test = y_test.astype(int)

print(y_test)

[0 1 1 1 2 2 2 1 2 2 1 1 2 2 1 2 1 1 0 1 1 0 0 1 2 2 1 2 1 0 2 2 2 1 1 1 2
 1 1 2 2 2 2 2 1 0 1 2 2 0 1 1 0 1 2 1 1 2 2 0 2 0 2 2 2 1 0 1 1 2 0 0 1 1
 1 2 1 2 2 2 2 1 1 1 2 2 1 1 2 1 1 2 1 0 2 2 2 0 0 0 0 0 0 0 2 2 0 2 1 0 1
 2 1 1 0 0 0 1 1 1 2 2 2 2 0 2 1 1 2 2 0 0 2 2 0 1 1 1 1 2 2 2 2 2 1 2 2 0
 0 1 1 0 0 1 2 2 1 0 1 2 2 1 0 0 2 2 1 2 0 0 0 1 2 1 0 0 2 2 0 0 2 1 1 2 2
 2 2 2 1 2 1 1 2 2 0 1 0 2 0 1 1 2 2 1 1 1 2 1 2 1 2 1 2 1 0 2 1 1 0 2 1 2
 0 2 2 2 2 2 1 1 2 1 2 0 2 0 1 0 0 1 2 1 1 0 0 2 0 2 0 0 1 1 1 2 0 1 2 0 1
 1 1 2 0 1 2 2 0 0 2 2 2 1 2 1 1 0 1 2 2 0 2 2 1 0 0 2 2 1 1 1 0 2 2 0 1 0
 1 2 2 2 2 0 1 2 2 1 2 1 1 1 1 1 2 2 1 1 1 1 0 2 0 0 1 2 2 2 0 2 0 1 2 0 0
 0 0 0 0 1 1 2 0 0 2 0 0 1 1 2 2 1 2 2 2 2 0 2 0 1 2 2 2 0 2 0 2 0 1 1 1 1
 2 1 1 2 2 1 1 2 1 1 1 2 1 1 1 0 1 1 2 0 0 1 1 2 1 2 2 1 2 0 1 1 0 1 2 1 1
 1 1 2 1 2 1 2 1 1 1 1 2 1 0 1 1 2 1 2 0 0 0 1 0 1 1 0 0 2 1 2 0 0 2 2 2 0
 2 1 2 0 0 1 2 1 0 1 0 0 0 2 0 1 2 1 1 0 2 2 0 2 0 1 0 1 2 0 1 2 0 0 2 2 1
 1 2 1 2 1 2 2 0 2 1 1 2 

### Model training

In [14]:
mdl=MultinomialNB()
mdl.fit(x_train,y_train)

### Model Evaluation

In [15]:
print('Training accuracy',round(accuracy_score(mdl.predict(x_train),y_train)*100,3))
print('Test accuracy',round(accuracy_score(mdl.predict(x_test),y_test)*100,3))

Training accuracy 82.318


ValueError: X has 3236 features, but MultinomialNB is expecting 17254 features as input.

### Model saving

In [16]:
joblib.dump(tfidf,'tfidf_model.joblib')       # saving tfidf vectorizer
pickle.dump(mdl,open('model.mdl','wb'))       # svaing mdl model