In [2]:
# Install the Kaggle CLI. The %pip magic installs into the environment of the
# running kernel instead of relying on automagic resolving a bare `pip`.
%pip install -q kaggle




In [4]:
# Set up the Kaggle configuration directory and copy the kaggle.json
# credentials file into it.
# Assumes kaggle.json is present in the current working directory — TODO confirm.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json


In [5]:
# Download the kazanova/sentiment140 dataset archive with the Kaggle CLI
# (saved as sentiment140.zip, see the output below).
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 68% 55.0M/80.9M [00:00<00:00, 150MB/s]
100% 80.9M/80.9M [00:00<00:00, 149MB/s]


In [7]:
# Extract the compressed dataset into the current working directory.
from zipfile import ZipFile

file_name = "/content/sentiment140.zip"

# Open the archive through the path variable (it was previously assigned but
# unused) and avoid naming the handle `zip`, which would shadow the built-in
# zip() for every later cell.
with ZipFile(file_name, 'r') as archive:
    archive.extractall()

print("Data is Extracted")

Data is Extracted


Import Dependencies

In [8]:
import numpy as np
import pandas as pd
import sklearn
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [9]:
# Fetch the NLTK stop-word corpus; it is required by stopwords.words(...) below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
# Show the English stop-word list that will be filtered out during preprocessing.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Data Preprocessing


In [11]:
# First look at the raw CSV using pandas' default header handling.
# NOTE: as the output below shows, the first record ends up as the column names,
# so the frame has 1,599,999 rows; the file is re-read with explicit column
# names in a later cell.
data = pd.read_csv('/content/training.1600000.processed.noemoticon.csv' , encoding = 'ISO-8859-1')
data

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
...,...,...,...,...,...,...
1599994,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599995,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599996,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599997,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [12]:
# (rows, columns) of the frame as currently loaded.
data.shape

(1599999, 6)

In [13]:
# Preview the first five rows; the column labels are still the values of the
# first record at this point.
data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [14]:
# The CSV has no header row, so supply the column names explicitly; otherwise
# the first record is consumed as the header and lost from the data.
column_names = ['Target', 'id', 'Date', 'Flag', 'User', 'Text']
data = pd.read_csv(
    '/content/training.1600000.processed.noemoticon.csv',
    # Standard codec spelling, matching the initial load of this file. The
    # previous 'ISO_8859-1' only worked through codec alias normalization.
    encoding='ISO-8859-1',
    names=column_names,
)
data

Unnamed: 0,Target,id,Date,Flag,User,Text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [15]:
# Count missing values per column to confirm no imputation or row dropping is needed.
data.isnull().sum()

Target    0
id        0
Date      0
Flag      0
User      0
Text      0
dtype: int64

In [16]:
# Class distribution of the Target column (labels are 0 and 4 before remapping).
data['Target'].value_counts()

Target
0    800000
4    800000
Name: count, dtype: int64

In [17]:
# Remap the label 4 to 1 so that Target is binary {0, 1}.
# Assigning the result back to the column (rather than inplace=True on the
# whole frame) limits the change to 'Target' and keeps the cell safe to re-run.
data['Target'] = data['Target'].replace(4, 1)

In [18]:
# Verify the remapping: labels should now be 0 and 1 with unchanged counts.
data['Target'].value_counts()

Target
0    800000
1    800000
Name: count, dtype: int64

**Stemming**

Stemming is the process of reducing a word to its root form (for example, "update" becomes "updat").


In [19]:
# Single Porter stemmer instance, shared by every call to stemming() below.
port_stem = PorterStemmer()

In [20]:
# Build the stop-word lookup once. Previously the NLTK stop-word list was
# re-read for every token of every tweet and scanned linearly; a frozenset
# gives constant-time membership tests with identical filtering results.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def stemming(contents):
    """Normalize raw text into a space-joined string of stemmed tokens.

    Processing steps, in order:
      1. Replace every character that is not an ASCII letter with a space.
      2. Lowercase the text and split it on whitespace.
      3. Drop English stop words.
      4. Reduce each remaining token to its Porter stem.

    Args:
        contents: The raw text to process (a string).

    Returns:
        The stemmed tokens joined by single spaces; an empty string when no
        token survives filtering.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', contents)
    tokens = letters_only.lower().split()
    stemmed_tokens = [
        port_stem.stem(token)
        for token in tokens
        if token not in _ENGLISH_STOP_WORDS
    ]
    return ' '.join(stemmed_tokens)

In [21]:
# Run the stemming pipeline over every tweet and keep the result in a new
# column, leaving the original 'Text' column untouched.
# NOTE: this processes all 1,600,000 rows one Python call at a time.
data["Stemmed_content"] = data['Text'].apply(stemming)

In [23]:
# Inspect the stemmed text (pandas truncates the printed Series).
print(data['Stemmed_content'])

0          switchfoot http twitpic com zl awww bummer sho...
1          upset updat facebook text might cri result sch...
2          kenichan dive mani time ball manag save rest g...
3                            whole bodi feel itchi like fire
4                              nationwideclass behav mad see
                                 ...                        
1599995                           woke school best feel ever
1599996    thewdb com cool hear old walt interview http b...
1599997                         readi mojo makeov ask detail
1599998    happi th birthday boo alll time tupac amaru sh...
1599999    happi charitytuesday thenspcc sparkschar speak...
Name: Stemmed_content, Length: 1600000, dtype: object


In [28]:
# Re-check the class balance after preprocessing; the counts are unchanged.
data['Target'].value_counts()

Target
0    800000
1    800000
Name: count, dtype: int64

In [34]:
# Separate features and labels as NumPy arrays:
#   X - the stemmed tweet text (object array of strings)
#   y - the 0/1 sentiment label
X = data['Stemmed_content'].values
y = data['Target'].values

In [33]:
# Inspect the feature array.
# NOTE(review): the execution counts show this cell (In [33]) was run before
# the cell above it (In [34]); re-run the notebook top to bottom to confirm.
X

array(['switchfoot http twitpic com zl awww bummer shoulda got david carr third day',
       'upset updat facebook text might cri result school today also blah',
       'kenichan dive mani time ball manag save rest go bound', ...,
       'readi mojo makeov ask detail',
       'happi th birthday boo alll time tupac amaru shakur',
       'happi charitytuesday thenspcc sparkschar speakinguph h'],
      dtype=object)

**Splitting Data into Train and Test Sets**

In [35]:
# Hold out 20% of the samples for testing. Stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

In [36]:
# Confirm the split sizes (80% train / 20% test).
print(X_train.shape , X_test.shape)

(1280000,) (320000,)


Convert text data into numerical data (TF-IDF features)

In [37]:
# Fit the TF-IDF vocabulary and weights on the training text only, then apply
# the same fitted transform to the test text so no test information leaks in.
# NOTE(review): X_train / X_test are rebound from raw text to the transformed
# features here, so this cell should not be re-run without first re-running
# the train/test split cell.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

Training Machine Learning Model

In [38]:
# Logistic-regression classifier with the iteration cap set to 1000.
# Presumably raised so the solver can converge on this large input — TODO confirm.
model = LogisticRegression(max_iter = 1000)

In [39]:
# Train the classifier on the TF-IDF features of the training split.
model.fit(X_train , y_train)

In [40]:
# Accuracy on the training split (the data the model was fitted on).
y_pred = model.predict(X_train)
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# value is unchanged, but the conventional order avoids mistakes if this is
# later swapped for an asymmetric metric such as precision or recall.
score = accuracy_score(y_train, y_pred)
print(score)

0.81018984375


Testing accuracy

In [41]:
# Accuracy on the held-out test split.
# The predictions get their own clearly named variable; they were previously
# stored in `x_test`, which differs from the feature matrix `X_test` only by case.
y_test_pred = model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred).
score = accuracy_score(y_test, y_test_pred)
print(score)

0.7780375


**Saving Model**

In [42]:
import pickle

# Persist the trained classifier. The context manager guarantees the file is
# flushed and closed even if pickling fails (the bare open() call leaked the handle).
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
# NOTE(review): the fitted `vectorizer` is not saved here, yet it is required to
# turn new raw text into features for this model; consider persisting it too.

**Loading Model**

In [43]:
# Reload the classifier from disk, closing the file handle deterministically.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

**Prediction**

In [47]:
# Sanity-check the reloaded model on a single held-out sample.
sample_index = 344
X_new = X_test[sample_index]
print(y_test[sample_index])  # true label

# Predict with the model restored from disk so the save/load round trip is
# actually exercised (the in-memory `model` was used here before).
prediction = loaded_model.predict(X_new)
print(prediction)

# predict() returns an array with one label per input row; compare its single
# element explicitly instead of relying on the truth value of an array.
if prediction[0] == 1:
  print("Positive")
else:
  print("Negative")

0
[0]
Negative
