**Installing Kaggle library**

In [1]:
# Install the Kaggle package; its `kaggle` command is used below to download the dataset
!pip install kaggle



**Configure the path of kaggle.json file**

In [2]:
!mkdir -p ~/.kaggle
!cp kaggle_Twitter.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle_Twitter.json

In [3]:
#Use the Kaggle command-line client to download the kazanova/sentiment140 dataset archive
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 98% 79.0M/80.9M [00:02<00:00, 40.1MB/s]
100% 80.9M/80.9M [00:02<00:00, 29.5MB/s]


In [4]:
#Extracting the compressed dataset
from zipfile import ZipFile
file_name = "/content/sentiment140.zip"

#The archive is bound to 'archive' rather than 'zip' so the built-in zip() is not
#overwritten for the rest of the notebook; the with-block closes the file afterwards
with ZipFile(file_name, 'r') as archive:   #Here 'r' stands for 'read'
  archive.extractall()   #no target path is given, so files are extracted into the current working directory
  print("Extracted")

Extracted


**Importing the dependencies**

In [5]:
import numpy as np
import pandas as pd
import re      #'re' is used for pattern matching
from nltk.corpus import stopwords   #Stopwords are the words that doesn't add any meaning to our textual data
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer    #Here tfid is used to convert the textual data into numerical data
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [6]:
#Download the NLTK 'stopwords' corpus so that stopwords.words('english') can be used below
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
#Print the list of English stopwords provided by NLTK
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

**Data Processing**

In [9]:
#Loading the data from csv file to pandas df
#NOTE(review): the notebook does not show where '/content/Twitter.csv' comes from - presumably the
#CSV extracted from the archive was renamed or uploaded under this name; confirm before re-running
#No column names are supplied here, so the first line of the file is used as the header row
twitter_sent = pd.read_csv('/content/Twitter.csv', encoding = 'ISO-8859-1')    #Here 'twitter_sent' is my dataframe

In [10]:
#Checking the number of rows and columns, returned as a (rows, columns) tuple
twitter_sent.shape

(1599999, 6)

In [11]:
#Display the first 5 rows
twitter_sent.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [19]:
#Naming the cols and reading the dataset again
#The names are kept in their own list instead of being written onto the previously loaded frame,
#so this cell no longer needs twitter_sent to exist already and can be run on its own
column_names = ['target', 'id', 'date', 'flag', 'user', 'tweet']
#Passing names= makes pandas treat the first line of the file as data rather than as a header
twitter_sent = pd.read_csv('/content/Twitter.csv', names = column_names, encoding = 'ISO-8859-1')

In [12]:
#Shape of the dataframe as a (rows, columns) tuple
twitter_sent.shape

(1599999, 6)

In [16]:
#Display the first rows of the dataframe
twitter_sent.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [20]:
#Count the null values in each column
twitter_sent.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
tweet     0
dtype: int64

In [21]:
#Checking the distribution of the target column
twitter_sent['target'].value_counts()   #by this we can see the data is equally distributed - '0' for negative and '4' for positive

target
0    800000
4    800000
Name: count, dtype: int64

**Replacing the target '4' to '1'**

In [22]:
#Relabel the positive class from 4 to 1 in the 'target' column only.
#A frame-wide replace(4, 1) would also rewrite any value equal to 4 in the other columns.
twitter_sent['target'] = twitter_sent['target'].replace(4, 1)

In [23]:
#Count the rows per target value after the relabelling
twitter_sent['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

0 --> for negative tweet

1 --> for positive tweet

**Stemming** - **It is a process of reducing a word to its root word**

Example: actor, actress, acting → act

In [24]:
#Single PorterStemmer instance, reused by stemming() for every word
port_stem = PorterStemmer()

In [25]:
#Build the stopword lookup once. Previously stopwords.words('english') was evaluated
#for every single word and searched as a list, which dominates the runtime on 1.6M tweets.
english_stopwords = set(stopwords.words('english'))

def stemming(content):
  """Clean one tweet and return its stemmed, stopword-free text.

  Steps: replace every character that is not an ASCII letter with a space,
  lowercase the text, split it on whitespace, drop English stopwords, stem the
  remaining words with ``port_stem`` and join them back with single spaces.

  Args:
    content: the raw tweet text (a string).

  Returns:
    A string of space-separated stemmed words; empty if nothing remains.
  """
  letters_only = re.sub('[^a-zA-Z]', ' ', content)  #[^a-zA-Z] matches anything that is NOT an upper or lower case letter
  words = letters_only.lower().split()
  stemmed_words = [port_stem.stem(word) for word in words if word not in english_stopwords]
  return ' '.join(stemmed_words)


In [28]:
#Run stemming() on every tweet and store the result in a new 'stemmed_content' column
twitter_sent['stemmed_content'] = twitter_sent['tweet'].apply(stemming)

In [29]:
#Display the first rows, now including the 'stemmed_content' column
twitter_sent.head()

Unnamed: 0,target,id,date,flag,user,tweet,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [30]:
#Show the stemmed text column
print(twitter_sent['stemmed_content'])

0          switchfoot http twitpic com zl awww bummer sho...
1          upset updat facebook text might cri result sch...
2          kenichan dive mani time ball manag save rest g...
3                            whole bodi feel itchi like fire
4                              nationwideclass behav mad see
                                 ...                        
1599995                           woke school best feel ever
1599996    thewdb com cool hear old walt interview http b...
1599997                         readi mojo makeov ask detail
1599998    happi th birthday boo alll time tupac amaru sh...
1599999    happi charitytuesday thenspcc sparkschar speak...
Name: stemmed_content, Length: 1600000, dtype: object


In [31]:
#Show the label column
print(twitter_sent['target'])

0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: target, Length: 1600000, dtype: int64


In [32]:
#Separating the data and labels
#X is the array of stemmed tweet texts, Y the array of matching target labels
X = twitter_sent['stemmed_content'].values
Y = twitter_sent['target'].values

In [33]:
#Show the array of stemmed tweet texts
print(X)

['switchfoot http twitpic com zl awww bummer shoulda got david carr third day'
 'upset updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound' ...
 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph h']


In [34]:
#Show the array of labels
print(Y)

[0 0 0 ... 1 1 1]


**Splitting the data into train and test data**

In [35]:

#Hold out 20% of the samples for testing; stratify = Y keeps the class proportions of Y in both splits,
#and the fixed random_state makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 2)

In [36]:
#Compare the sizes of the full data, the training split and the test split
print(X.shape, X_train.shape, X_test.shape)

(1600000,) (1280000,) (320000,)


In [37]:
#Show the training texts
print(X_train)

['watch saw iv drink lil wine' 'hatermagazin'
 'even though favourit drink think vodka coke wipe mind time think im gonna find new drink'
 ... 'eager monday afternoon'
 'hope everyon mother great day wait hear guy store tomorrow'
 'love wake folger bad voic deeper']


In [38]:
#Show the test texts
print(X_test)

['mmangen fine much time chat twitter hubbi back summer amp tend domin free time'
 'ah may show w ruth kim amp geoffrey sanhueza'
 'ishatara mayb bay area thang dammit' ...
 'destini nevertheless hooray member wonder safe trip' 'feel well'
 'supersandro thank']


In [39]:
#Converting the textual data to numerical data - To do this we'll use vectorizer
vectorizer = TfidfVectorizer()

#The vectorizer is fitted on the training texts only and then reused, unchanged, on the test texts
#Note: X_train and X_test are rebound here from arrays of strings to the vectorizer's output
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [40]:
#Show the vectorised training data (X_train now holds the output of fit_transform)
print(X_train)

  (0, 443066)	0.4484755317023172
  (0, 235045)	0.41996827700291095
  (0, 109306)	0.3753708587402299
  (0, 185193)	0.5277679060576009
  (0, 354543)	0.3588091611460021
  (0, 436713)	0.27259876264838384
  (1, 160636)	1.0
  (2, 288470)	0.16786949597862733
  (2, 132311)	0.2028971570399794
  (2, 150715)	0.18803850583207948
  (2, 178061)	0.1619010109445149
  (2, 409143)	0.15169282335109835
  (2, 266729)	0.24123230668976975
  (2, 443430)	0.3348599670252845
  (2, 77929)	0.31284080750346344
  (2, 433560)	0.3296595898028565
  (2, 406399)	0.32105459490875526
  (2, 129411)	0.29074192727957143
  (2, 407301)	0.18709338684973031
  (2, 124484)	0.1892155960801415
  (2, 109306)	0.4591176413728317
  (3, 172421)	0.37464146922154384
  (3, 411528)	0.27089772444087873
  (3, 388626)	0.3940776331458846
  (3, 56476)	0.5200465453608686
  :	:
  (1279996, 390130)	0.22064742191076112
  (1279996, 434014)	0.2718945052332447
  (1279996, 318303)	0.21254698865277746
  (1279996, 237899)	0.2236567560099234
  (1279996, 2910

**Training the machine learning model**

Logistic Regression

In [42]:
#max_iter is the upper limit on the number of solver iterations allowed while fitting the model
model = LogisticRegression(max_iter = 1000)

In [43]:
model.fit(X_train, Y_train)  #X_train holds the vectorised tweets (features), Y_train the matching sentiment labels

**Model Evaluation**

Accuracy score

In [44]:
#Accuracy score of the training data
X_train_prediction = model.predict(X_train)
#accuracy_score expects (y_true, y_pred): the true labels first, the predictions second
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [45]:
#Report the accuracy measured on the training split
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.81018984375


In [46]:
#Accuracy score of the test data
X_test_prediction = model.predict(X_test)
#accuracy_score expects (y_true, y_pred): the true labels first, the predictions second
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [47]:
#Report the accuracy measured on the held-out test split
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.7780375


Model Accuracy = 77.8%

**Now, saving the trained model**

In [48]:
import pickle

In [50]:
#pickle.dump writes the already-trained model object to disk; it does not train anything
#NOTE: only the classifier is saved here. The fitted TfidfVectorizer is not written by this cell,
#and it is what converts text into the features the model was trained on
file_name = 'trained_model.save'
with open(file_name, 'wb') as model_file:  #'wb' = write in binary mode; the with-block closes the file
  pickle.dump(model, model_file)

**Using the saved model for future predictions**

In [51]:
#Loading the saved model
#Only unpickle files you created yourself: pickle.load can run arbitrary code stored in the file
with open('/content/trained_model.save', 'rb') as model_file:  #'rb' = read in binary mode; the with-block closes the file
  loaded_model = pickle.load(model_file)

In [55]:
#Pick the row at index 200 of the vectorised test set and show its true label
X_new = X_test[200]
print(Y_test[200])

#Predict with the model that was loaded back from disk
prediction = loaded_model.predict(X_new)
print(prediction)

#Label 0 means negative; anything else is reported as positive
print('The tweet is negative' if prediction[0] == 0 else 'The tweet is positive')


1
[1]
The tweet is positive


Hence, the saved model has successfully predicted the sentiment of a held-out test tweet.