### 1. Importing Libraries

In [48]:
import pandas as pd
import re
import joblib
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

### 2. Importing Dataset

In [49]:
df = pd.read_csv('twitter_validation.csv',header=None)
df = df.rename({0 : 'id', 1: 'company', 2:'sentiment',3:'raw_tweet'},axis=1)

df = df[df['sentiment'] != 'Irrelevant']
df = df[df['sentiment'] != 'Neutral']

tweets = df['raw_tweet'].values

### 3. Text preprocessing

In [67]:
processed_tweets = []

for tweet in tweets:
    #print(tweet,'\n')
    # Remove special characters
    tweet = re.sub(r'[^a-zA-Z]',' ',tweet)
    #print(tweet,'\n')
       
    # lowercase
    tweet = tweet.lower()
    #print(tweet,'\n')
    
    # Removing stopwords
    tweet = [word for word in tweet.split(' ') if not word in stopwords.words('english')]
    #print(tweet,'\n')
    
    # Stem(Root) words
    tweet = [stemmer.stem(word) for word in tweet]
    #print(tweet,'\n')
    
    # Removing null
    tweet = [word for word in tweet if len(word) !=0]
    #print(tweet,'\n')
    
    tweet = ' '.join(tweet)
    #print(tweet,'\n')
    
    processed_tweets.append(tweet)

In [68]:
len(processed_tweets)

543

### 4. TF-IDF

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
# Encoding tweets
enc_tweets = tfidf.fit_transform(processed_tweets)
df_ = pd.DataFrame(enc_tweets.toarray(),columns=tfidf.get_feature_names_out())
df_

Unnamed: 0,abil,abl,absolut,absurd,access,accessibleatx,accomplish,account,aceofpyrit,achiev,...,ziryhrf,zlcc,zone,zoom,zqw,ztc,ztl,zukf,zy,zyot
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.193864,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
540,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
541,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [71]:
enc_tweets.toarray().shape

(543, 2499)

### 5. Saving the model and encodings

In [65]:
# Saving the model
joblib.dump(tfidf,'tfidf_model.joblib')
df.to_csv('tfidf_enc.csv',index=False)

print('Model is saved with name tfidf_model.joblib!')
print('Encodings are saved with name tfidf_enc.csv')

Model is saved with name tfidf_model.joblib!
Encodings are saved with name tfidf_enc.csv


In [66]:
# loading the model
joblib.load('tfidf_model.joblib')

### 6. Preparing Features and Labels

In [74]:
x = enc_tweets.toarray()
y = pd.get_dummies(df['sentiment']).values[:,1:]

### 7.Split the dataset into training and testing

In [75]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.2)

### 8. Model Training

In [82]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


### 9. Model Evaluation

In [85]:
y_pred = model.predict(x_test)

In [88]:
pos = 0
neg = 0
for i,j in zip(y_test,y_pred):
    if(i[0]==j):
        pos += 1
    else:
        neg += 1
print('Accuracy:',round(pos/(pos + neg),2)*100)

Accuracy: 78.0


In [91]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.7798165137614679

In [89]:
len(df[df['sentiment'] == 'Negative'])

266

In [90]:
len(df[df['sentiment'] == 'Positive'])

277