# SENTIMENT ANALYSIS

## 1. Importing the required libraries

In [21]:
# importing required libraries
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

## 2. Loading the dataset

In [22]:
# read the dataset
data = pd.read_csv('Merged Twitter Data.csv')
# view the top rows
data.head()

Unnamed: 0.1,Unnamed: 0,date,user_name,text,user_followers,sentiment
0,0,12/12/2017,karlyndrake2617,RT @realsheepwolf: 👉 $INTV 👈 SEE WHY THIS IS A...,0.0,positive
1,1,12/12/2017,marketranger,Nearly $70M of Bitcoin Hacked From Crypto Mini...,2040.0,positive
2,2,12/12/2017,ProgrammingWrld,RT @BitcoinWrld: Shorting #Bitcoin a challenge...,23033.0,positive
3,3,12/12/2017,Tatoshak,#RPT #bounty #bitcoin #cryptocurrency #blockch...,12430.0,neutral
4,4,12/12/2017,MarkMarkafv,@RonPaul Answer is clear: If you bought $1000 ...,25.0,positive


In [23]:
data.tail()

Unnamed: 0.1,Unnamed: 0,date,user_name,text,user_followers,sentiment
67305,67291,2021-04-15 08:26:37,aWebAnalysis | Crypto,"Bitcoin BTC Current Price:\n$43,965.35\n1 Hour...",1877.0,neutral
67306,67292,2021-04-15 08:26:37,DFS Misko,So excited about #Bitcoin. This is the future....,222.0,positive
67307,67293,2021-04-15 08:26:37,Jay Davila,New #crypto #magazine hits the streets soon! I...,116.0,positive
67308,67294,2021-04-15 08:26:37,CryptoPennyCO 🚀💰🤷🏻‍♂️ 2025,On CNBC to comment on Tesla's (TSLA +0.8%) dis...,2750.0,neutral
67309,67295,2021-04-15 08:26:37,Crypto Enthusiast (Do what is right .... becau...,ALERT!! #Reddcoin #Redd #RDD IS ON THE MOVE!!!...,90.0,neutral


In [24]:
data.sentiment.value_counts()

neutral     31284
positive    28447
negative     7554
Name: sentiment, dtype: int64

In [25]:
data = data[["text","sentiment"]]

def int_to_string(sentiment):
    if sentiment == 'positive':
        return 0
    else:
        return 1
data.sentiment = data.sentiment.apply(int_to_string)
data.sentiment = pd.to_numeric(data.sentiment)
data.head()

Unnamed: 0,text,sentiment
0,RT @realsheepwolf: 👉 $INTV 👈 SEE WHY THIS IS A...,0
1,Nearly $70M of Bitcoin Hacked From Crypto Mini...,0
2,RT @BitcoinWrld: Shorting #Bitcoin a challenge...,0
3,#RPT #bounty #bitcoin #cryptocurrency #blockch...,1
4,@RonPaul Answer is clear: If you bought $1000 ...,0


In [26]:
data.tail()

Unnamed: 0,text,sentiment
67305,"Bitcoin BTC Current Price:\n$43,965.35\n1 Hour...",1
67306,So excited about #Bitcoin. This is the future....,0
67307,New #crypto #magazine hits the streets soon! I...,0
67308,On CNBC to comment on Tesla's (TSLA +0.8%) dis...,1
67309,ALERT!! #Reddcoin #Redd #RDD IS ON THE MOVE!!!...,1


In [27]:
data.sentiment.value_counts()

1    38863
0    28447
Name: sentiment, dtype: int64

In [28]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67310 entries, 0 to 67309
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       67296 non-null  object
 1   sentiment  67310 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.0+ MB


In [29]:
data.isna().sum()

text         14
sentiment     0
dtype: int64

In [30]:
data = data.dropna()
data.isna().sum()

text         0
sentiment    0
dtype: int64

In [31]:
# train test split
train, test = train_test_split(data, test_size = 0.2, stratify = data['sentiment'], random_state=21)

# get the shape of train and test split.
train.shape, test.shape

((53836, 2), (13460, 2))

In [32]:
# create a TF-IDF vectorizer object
tfidf_vectorizer = TfidfVectorizer(lowercase= True, max_features=1000, stop_words=ENGLISH_STOP_WORDS)

# fit the object with the training data tweets
tfidf_vectorizer.fit(train.text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=1000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}),
                strip_accents=None, sublinear_tf=False,
                token_pa

In [33]:
# transform the train and test data
train_idf = tfidf_vectorizer.transform(train.text)
test_idf  = tfidf_vectorizer.transform(test.text)

In [34]:
# Training Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
NB_model = MultinomialNB()
NB_model.fit(train_idf, train.sentiment)
X_predict_nb = NB_model.predict(train_idf)
y_predict_nb = NB_model.predict(test_idf)

# f1 score on train data
f1_score(y_true= train.sentiment, y_pred= X_predict_nb, average='micro')


f1_score(y_true= test.sentiment, y_pred= y_predict_nb, average='micro')

0.8468796433878157

In [35]:
# create the object of Logistic Regression Model
model_LR = LogisticRegression(multi_class= "multinomial", solver= 'lbfgs')

# fit the model with the training data
model_LR.fit(train_idf, train.sentiment)

# predict the label on the traning data
predict_train = model_LR.predict(train_idf)

# predict the model on the test data
predict_test = model_LR.predict(test_idf)

# f1 score on train data
f1_score(y_true= train.sentiment, y_pred= predict_train, average='micro')


f1_score(y_true= test.sentiment, y_pred= predict_test, average='micro')


0.9032689450222883

In [36]:
# define the stages of the pipeline
pipeline = Pipeline(steps= [('tfidf', TfidfVectorizer(lowercase=True,
                                                      max_features=1000,
                                                      stop_words= ENGLISH_STOP_WORDS)),
                            ('model', LogisticRegression())])

# fit the pipeline model with the training data                            
pipeline.fit(train.text, train.sentiment)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=1000,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=frozenset({'a', 'about', 'above',
                                                       'across', 'after',
                                                       'afterward...
                                 strip_accents=None, sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),


In [37]:
# sample tweet
text = ["bitcoin is great"]

# predict the label using the pipeline
pipeline.predict(text)

array([0])