### Importing Dependencies

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
#nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### Importing Twitter sentiment dataset

In [22]:
# `names=` is passed to read_csv, so pandas labels the columns with these names and
# treats the first line of the file as data rather than as a header.
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
twitter_data = pd.read_csv('../twitter_data.csv', names=column_names, encoding='ISO-8859-1')

# Only the last expression of a cell is displayed; a bare `twitter_data.shape` in the
# middle of the cell was evaluated and thrown away, so print it explicitly.
print(twitter_data.shape)
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [23]:
# Missing values per column. Printed explicitly because only the final expression of a
# cell is rendered; the bare expression used here before was silently discarded.
print(twitter_data.isnull().sum())

# Class balance of the sentiment label.
twitter_data['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [24]:
# Relabel the positive class from 4 to 1 so the target column holds 0 (negative) / 1 (positive).
twitter_data['target'] = twitter_data['target'].replace(4, 1)

##### 1 -> Positive Tweet and 0 -> Negative Tweet

In [2]:
# Lazily-built NLTK resources shared by every lemmatization() call. Building them once
# matters because the function is applied to each tweet individually.
_LEMMATIZATION_RESOURCES = {}


def _get_lemmatization_resources():
    """Return ``(lemmatizer, stop_words)``, creating both on first use only."""
    if not _LEMMATIZATION_RESOURCES:
        # Build both objects before storing either, so a failure (e.g. a missing NLTK
        # corpus) never leaves the cache half-populated.
        lemmatizer = WordNetLemmatizer()
        stop_words = frozenset(stopwords.words("english"))
        _LEMMATIZATION_RESOURCES.update(lemmatizer=lemmatizer, stop_words=stop_words)
    return _LEMMATIZATION_RESOURCES['lemmatizer'], _LEMMATIZATION_RESOURCES['stop_words']


def lemmatization(content):
    """Normalise one tweet into a space-separated string of lemmatized tokens.

    Steps: replace every non-alphabetic character with a space, lowercase, split on
    whitespace, drop English stop words, lemmatize each remaining token.

    Args:
        content: Raw tweet text. Non-string values (e.g. ``None`` or the ``NaN`` pandas
            uses for missing cells) are treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing remains.
    """
    if not isinstance(content, str):
        # re.sub() would raise TypeError on NaN/None; a missing tweet has no tokens.
        return ''

    lemmatizer, stop_words = _get_lemmatization_resources()
    content = re.sub('[^a-zA-Z]', ' ', content).lower()
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in content.split()
        if token not in stop_words
    )

In [None]:
# Run lemmatization() on every value of the 'text' column and store the result in a new
# 'lemmatized_content' column.
# NOTE(review): this cell has no execution count, and the next cell reads
# '../twitter_data_cleaned.csv', which already contains a 'lemmatized_content' column.
# Presumably that file was produced by saving this result, but no cell in view writes it.
# TODO confirm and add the save step so the notebook can be re-run from scratch.
twitter_data['lemmatized_content'] = twitter_data['text'].apply(lemmatization)

In [3]:
# Load the already-cleaned dataset (it carries the 'lemmatized_content' column), which
# replaces whatever `twitter_data` held before.
twitter_data = pd.read_csv('../twitter_data_cleaned.csv', encoding='ISO-8859-1')

# Print the shape explicitly: a bare expression that is not the last line of the cell
# is never displayed.
print(twitter_data.shape)
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text,lemmatized_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset update facebook texting might cry result...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dived many time ball managed save res...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole body feel itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behaving mad see


In [4]:
# Preview the first rows of the cleaned text that is later used as the model input.
twitter_data['lemmatized_content'].head()

0    switchfoot http twitpic com zl awww bummer sho...
1    upset update facebook texting might cry result...
2    kenichan dived many time ball managed save res...
3                      whole body feel itchy like fire
4                     nationwideclass behaving mad see
Name: lemmatized_content, dtype: object

In [5]:
# Preview the first rows of the sentiment label column.
twitter_data['target'].head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

### Splitting the data into training and test sets

In [6]:
# Features are the cleaned tweet text; labels are the sentiment targets cast to strings.
x, y = twitter_data['lemmatized_content'], twitter_data['target'].astype(str)

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)
print(x_train.shape, x_test.shape)

(1280000,) (320000,)


In [7]:
# Replace missing documents with empty strings in both splits before vectorizing.
# (presumably tweets whose cleaned text was empty come back as NaN from the CSV — TODO confirm)
x_train, x_test = x_train.fillna(""), x_test.fillna("")

### Converting textual data into numerical data (TF-IDF)

In [8]:
# TF-IDF vectorizer used to turn the cleaned tweets into numeric features.
# ngram_range=(1, 2) makes it extract single words and pairs of adjacent words.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # unigrams + bigrams

In [9]:
# Fit the vectorizer on the training text only, then reuse the fitted vectorizer for the
# test text (transform, not fit_transform) so both splits share one feature space.
# NOTE: x_train / x_test are rebound from text to the transformed matrices here, so this
# cell must not be re-run without first re-running the split and fillna cells.
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [14]:
# Print the vectorized training features; each printed entry is `(row, column)  weight`.
print(x_train)

  (0, 275637)	0.3927239016616036
  (0, 1229415)	0.433267698555852
  (0, 276820)	0.8112009852384319
  (1, 2235677)	0.4799570481157655
  (1, 4119394)	0.3969282533630161
  (1, 2237000)	0.7823612935506116
  (2, 2168873)	0.3424673319539078
  (2, 380331)	0.12338417562718157
  (2, 1208063)	0.1515058744330477
  (2, 876769)	0.08934486250939139
  (2, 3887479)	0.16060178514869247
  (2, 1363226)	0.12032835002868905
  (2, 1406912)	0.09421667279884167
  (2, 2532003)	0.2622132556163107
  (2, 3819600)	0.09930498150054952
  (2, 2168874)	0.3424673319539078
  (2, 381196)	0.2662321065635793
  (2, 1208370)	0.2832435814927878
  (2, 885069)	0.25807426575254633
  (2, 3888023)	0.2853173576157468
  (2, 1364533)	0.2536714257382264
  (2, 1411948)	0.3328037237111846
  (2, 2532040)	0.3424673319539078
  (3, 1926786)	0.3273093227912266
  (3, 2802167)	0.24038336886137307
  :	:
  (1279998, 2476203)	0.18415284440182522
  (1279998, 827736)	0.28014484906908
  (1279998, 2496698)	0.27854731813361056
  (1279998, 3238959)	0.2

In [15]:
# Print the vectorized test features; each printed entry is `(row, column)  weight`.
print(x_test)

  (0, 45273)	0.20307191054987783
  (0, 475214)	0.25997186396788213
  (0, 475217)	0.277678768519101
  (0, 1463739)	0.08194681420233126
  (0, 1470926)	0.277678768519101
  (0, 1775117)	0.08695081137489943
  (0, 1777141)	0.14207729165174346
  (0, 2020279)	0.08543988037271541
  (0, 2025668)	0.19642498269668185
  (0, 2477816)	0.212943385015019
  (0, 2477855)	0.277678768519101
  (0, 2546517)	0.277678768519101
  (0, 2694482)	0.09112639958403607
  (0, 2696451)	0.19386258388769936
  (0, 2842059)	0.22668017755320455
  (0, 2879160)	0.1235048251116298
  (0, 2879183)	0.277678768519101
  (0, 3060093)	0.12370128555320235
  (0, 3061260)	0.25997186396788213
  (0, 3420883)	0.14233013166833886
  (0, 3421575)	0.25997186396788213
  (0, 3591924)	0.11033164682742876
  (0, 3872378)	0.1342852193320045
  (0, 4007980)	0.23048151514540383
  (0, 4228669)	0.11566166631021106
  :	:
  (319997, 2438094)	0.5085616169521757
  (319997, 2539215)	0.3011878170551462
  (319998, 1406912)	0.17297965615245975
  (319998, 1785198)

### Training the Machine Learning model

### 1. Logistic Regression and Evaluation 


In [28]:
# Train the logistic-regression classifier; fit() returns the estimator itself, so the
# construction and the fit can be chained. max_iter is raised to 1000 iterations.
lr_model = LogisticRegression(max_iter=1000).fit(x_train, y_train)

# Accuracy on the data the model was trained on versus the held-out split.
print("LR train accuracy:", accuracy_score(y_true=y_train, y_pred=lr_model.predict(x_train)))
print("LR test accuracy:", accuracy_score(y_true=y_test, y_pred=lr_model.predict(x_test)))

LR train accuracy: 0.8528984375
LR test accuracy: 0.793021875


### Saving the trained model

In [20]:
import pickle

In [21]:
filename = "trained_model.sav"

# Persist the logistic-regression model trained above. The previous code dumped `model`,
# a name that no cell in view defines, so it raised NameError on a fresh kernel.
# The `with` block guarantees the file is flushed and closed once the dump finishes.
with open(filename, 'wb') as model_file:
    pickle.dump(lr_model, model_file)

### Using the saved model for future predictions

In [22]:
# Load the saved model from the same relative path the save cell writes to, instead of
# '../Twitter-Sentimental-Analysis/...', which only resolves when the working directory
# happens to have exactly that name.
# SECURITY: pickle can execute arbitrary code while loading; only unpickle files you trust.
with open("trained_model.sav", "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [23]:
# Take one held-out example and compare its stored label with the model's prediction.
x_new = x_test[500]  # Works with csr_matrix
print("actual value : ", y_test.iloc[500])  # pandas Series

y_new = loaded_model.predict(x_new)

# The labels are cast to strings before the split (`.astype(str)`), so a model trained on
# them predicts "0", not the integer 0. The old `y_new[0] == 0` check was then always
# False and every tweet was reported as positive. Comparing the string form works for
# both string and integer labels.
if str(y_new[0]) == "0":
    print("Negative Tweet")
else:
    print("Positive Tweet")


actual value :  4
Positive Tweet


In [24]:
def predict_sentiment_from_tweet(tweet, vectorizer, model):
    """Print the sentiment the model predicts for a single tweet.

    Args:
        tweet: Tweet text. It is passed to the vectorizer as-is, so it should already be
            preprocessed the same way as the text the vectorizer was fitted on.
        vectorizer: Fitted vectorizer exposing ``transform(list_of_str)``.
        model: Fitted classifier exposing ``predict(features)``.

    Returns:
        None. The result is printed as ``Predicted sentiment is: Positive|Negative``.
    """
    tweet_vec = vectorizer.transform([tweet])  # transform only: never re-fit on new text
    pred = model.predict(tweet_vec)

    # The notebook trains on string labels, so `pred[0] == 1` was always False and the
    # function always reported "Negative". Compare the string form instead, using the
    # same rule as the single-row check elsewhere in the notebook: "0" is negative,
    # anything else is positive. This works for both string and integer labels.
    is_negative = str(pred[0]) == "0"
    print("Predicted sentiment is:", "Negative" if is_negative else "Positive")

In [25]:
tweet = "For those who are interested, I have spoken for the first time in nine years on this podcast.I want to say sorry to employees of Kingfisher Airlines and also to set the record straight with facts and the truth."

# The vectorizer was fitted on the 'lemmatized_content' column, so the raw tweet has to go
# through the same lemmatization() step first; otherwise its tokens and bigrams do not
# line up with the vocabulary the model was trained on.
predict_sentiment_from_tweet(lemmatization(tweet), vectorizer, loaded_model)

Predicted sentiment is: Negative


### 2. Naive Bayes model and Evaluation

In [27]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the same TF-IDF features; fit() returns
# the estimator, so the construction and the fit are chained.
nb_model = MultinomialNB().fit(x_train, y_train)

# Score the model on the held-out split only.
nb_test_accuracy = accuracy_score(y_true=y_test, y_pred=nb_model.predict(x_test))
print("NB test accuracy:", nb_test_accuracy)

NB test accuracy: 0.77956875


In [29]:
# Shell commands run from the notebook: stage everything under the current directory,
# commit it, then pull from origin/main.
# NOTE(review): `git add .` stages every change under the working directory — check that
# large data or model files are not swept in unintentionally.
!git add .
!git commit -m "Save local changes before pulling"
!git pull origin main




[main 3ea1135] add Naive Bayesian model and Evaluation on twitter sentiment analysis
 2 files changed, 102 insertions(+), 970 deletions(-)
branch 'main' set up to track 'origin/main'.


To https://github.com/RevanasiddaNK/Twitter-Sentimental-Analysis.git
   7c0cd05..3ea1135  main -> main
