In [1]:
#Installing Kaggle Library
!pip install kaggle



In [2]:
import os
import shutil

In [3]:
# Make sure the per-user Kaggle config folder is present (no error if it already is)
kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)

In [4]:
# Put the API token from the working directory into the .kaggle folder;
# the destination path returned by shutil.copy is echoed as the cell output
shutil.copy(
    src="kaggle.json",
    dst=os.path.expanduser("~/.kaggle/kaggle.json"),
)

'C:\\Users\\Aryan Kumar/.kaggle/kaggle.json'

In [5]:
# Restrict the token file to owner read/write; the chmod is skipped on Windows
kaggle_path = os.path.expanduser("~/.kaggle/kaggle.json")
on_windows = os.name == 'nt'
if not on_windows:
    os.chmod(kaggle_path, 0o600)

### Importing the Twitter Sentiment Analysis Dataset

In [6]:
# Download the sentiment140 dataset archive from Kaggle using the Kaggle CLI.
# The archive (sentiment140.zip) is what the next cell extracts.
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
sentiment140.zip: Skipping, found more recently modified local copy (use --force to force download)


In [7]:
# Unpack the downloaded archive into the ./sentiment140 folder
from zipfile import ZipFile

with ZipFile('sentiment140.zip', mode='r') as archive:
    archive.extractall(path='sentiment140')
    print("Dataset extracted successfully!")

Dataset extracted successfully!


#### Importing The Dependencies

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [9]:
import nltk
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads from
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Aryan
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
#printing the stopwords in English
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

### About Datasets

This is the sentiment140 dataset. It contains 1,600,000 tweets extracted using the Twitter API. The tweets have been annotated (0 = negative, 4 = positive) and can be used to detect sentiment.

#### Content

It contains the following 6 fields:
1. target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
2. ids: the id of the tweet (2087)
3. date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)
4. flag: The query (lyx). If there is no query, then this value is NO_QUERY.
5. user: the user that tweeted (robotickilldozr)
6. text: the text of the tweet (Lyx is cool)

#### Data Processing

In [11]:
#Loading the CSV Data
# NOTE: no header/names argument is passed here, so pandas uses the first tweet
# row as the column header (the shape reported below is 1,599,999 rows, one short).
# A later cell re-reads the file with explicit column names.
twitter_data = pd.read_csv('sentiment140/training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1')

In [12]:
twitter_data.shape

(1599999, 6)

In [13]:
twitter_data.head(5)

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [14]:
# Re-read the CSV, this time supplying the six column labels ourselves
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
twitter_data = pd.read_csv(
    'sentiment140/training.1600000.processed.noemoticon.csv',
    names=column_names,
    encoding='ISO-8859-1',
)

In [15]:
twitter_data.head(5)

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [16]:
#counting the number of missing values in the dataset
twitter_data.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [17]:
#checking the distribution of the target column
twitter_data['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

#### Convert the target '4' to '1'

In [18]:
# Relabel the positive class: every 4 in the target column becomes 1
twitter_data['target'] = twitter_data['target'].replace({4: 1})

In [19]:
twitter_data['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

0 ----> Negative <br>
1 ----> Positive

**Stemming**

Stemming is the process of reducing a word to its root word <br>
Example: Actor, Actress, Acting = Act

In [20]:
port_stem = PorterStemmer()

In [21]:
# Build the stop-word lookup once. Calling stopwords.words('english') inside the
# per-word filter re-reads the corpus and scans a list for every token of every
# tweet; a frozenset built a single time gives the same result with O(1) lookups.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Clean one raw tweet and return its stemmed, stop-word-free text.

    Processing order (kept exactly, since the fitted vocabulary depends on it):
      1. drop @mentions (letters/digits only, so '_suffix' parts of a handle remain)
      2. drop '#' symbols (the hashtag word itself is kept)
      3. drop 'RT' followed by whitespace
      4. drop http/https links
      5. replace every non-letter character with a space
      6. lowercase and split on whitespace
      7. discard English stop words and Porter-stem what is left

    Args:
        content: Raw tweet text (str).

    Returns:
        str: The surviving stems joined by single spaces ('' if nothing survives).
    """
    cleaned = re.sub(r'@[A-Za-z0-9]+', '', content)    # @mentions
    cleaned = re.sub(r'#', '', cleaned)                # '#' symbol
    cleaned = re.sub(r'RT[\s]+', '', cleaned)          # retweet marker
    cleaned = re.sub(r'https?:\/\/\S+', '', cleaned)   # hyperlinks
    cleaned = re.sub(r'[^a-zA-Z]', ' ', cleaned)       # anything that is not a letter
    # str.split() with no argument already collapses runs of whitespace,
    # so a separate "remove extra spaces" substitution is unnecessary.
    words = cleaned.lower().split()
    return ' '.join(
        port_stem.stem(word) for word in words if word not in _ENGLISH_STOP_WORDS
    )

In [22]:
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming)

In [23]:
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww bummer shoulda got david carr third day
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,dive mani time ball manag save rest go bound
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",behav mad see


In [24]:
print(twitter_data['stemmed_content'])

0               awww bummer shoulda got david carr third day
1          upset updat facebook text might cri result sch...
2               dive mani time ball manag save rest go bound
3                            whole bodi feel itchi like fire
4                                              behav mad see
                                 ...                        
1599995                           woke school best feel ever
1599996              thewdb com cool hear old walt interview
1599997                         readi mojo makeov ask detail
1599998    happi th birthday boo alll time tupac amaru sh...
1599999                                 happi charitytuesday
Name: stemmed_content, Length: 1600000, dtype: object


In [25]:
print(twitter_data['target'])

0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: target, Length: 1600000, dtype: int64


In [26]:
# Separate the features (cleaned tweet text) from the labels, both as NumPy arrays
X, Y = (twitter_data[column].values for column in ('stemmed_content', 'target'))

In [27]:
print(X)

['awww bummer shoulda got david carr third day'
 'upset updat facebook text might cri result school today also blah'
 'dive mani time ball manag save rest go bound' ...
 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday']


In [28]:
print(Y)

[0 0 0 ... 1 1 1]


#### Splitting the data into training data and test data

In [29]:
# 80/20 split, stratified on the label, with a fixed seed for reproducibility
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

In [30]:
print(X.shape, X_train.shape, X_test.shape)

(1600000,) (1280000,) (320000,)


In [31]:
print(X_train)

['lol get idea far advanc even june yet need third knitter summer group'
 'worst headach ever'
 'sad wont see miss alreadi yeah perfect come back th' ...
 'got home meet talk endlessli one coolest guy ever met smile'
 'bought chocol bar quot win free bar quot label win either'
 'said hope dm email sunday']


In [32]:
print(X_test)

['denali ye black red fav color realli want color def look awesom jare'
 'qu buy open hous weekend pm best valu one bedroom lic long island citi bd'
 'fran greet air okay hahahaha thank' ...
 'brat follow also hope atleast get also wish get well soon'
 'feel like decent swell sinc last fall hope wave myrtl beach week either least golf'
 'relaxin busi day']


In [33]:
# Converting the textual data into numerical data using TfidfVectorizer
# NOTE: X_train / X_test are rebound here from arrays of text to TF-IDF matrices,
# so this cell is not idempotent: re-running it without first re-running the
# train/test split cell would pass already-vectorized data to the vectorizer.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train) # Fit the vectorizer to the training data
X_test = vectorizer.transform(X_test) # Transform the test data using the fitted vectorizer

In [34]:
print(X_train)

  (0, 98792)	0.17051904120168906
  (0, 64373)	0.14572332638353142
  (0, 79248)	0.25255530889446465
  (0, 55266)	0.24928856075186495
  (0, 2298)	0.35784345197301765
  (0, 52847)	0.20645081729393253
  (0, 87763)	0.27221441240390787
  (0, 193205)	0.22730848915330898
  (0, 115754)	0.1817112572518178
  (0, 169113)	0.3359288166937043
  (0, 92019)	0.4847719750842347
  (0, 162212)	0.23172774423979603
  (0, 68678)	0.30454313380077225
  (1, 189972)	0.6355203097899969
  (1, 72560)	0.5772696016170398
  (1, 52899)	0.5127121442811071
  (2, 144466)	0.28865099313024034
  (2, 189149)	0.39379178215195865
  (2, 147941)	0.257574592184754
  (2, 109028)	0.2513899662426339
  (2, 5183)	0.32846109493604425
  (2, 192497)	0.30474098003303646
  (2, 128059)	0.40965307752304914
  (2, 33844)	0.2731578803992047
  (2, 12493)	0.2513964453493461
  :	:
  (1279996, 122088)	0.5690473360697704
  (1279997, 52899)	0.254529955326331
  (1279997, 122365)	0.19083841056987663
  (1279997, 75875)	0.2049080288342123
  (1279997, 69645

In [35]:
print(X_test)

  (0, 11335)	0.18778881995740343
  (0, 18701)	0.2491836696613244
  (0, 33717)	0.5448343835421826
  (0, 41178)	0.27043912170668244
  (0, 41838)	0.4266094571203387
  (0, 55653)	0.27080471599192785
  (0, 84771)	0.33461198682171117
  (0, 99190)	0.16197101104067438
  (0, 138425)	0.1553670848009902
  (0, 138948)	0.2391304321290182
  (0, 184146)	0.1497084362268776
  (0, 192436)	0.18813694505942702
  (1, 14854)	0.3505018561509464
  (1, 15465)	0.29381567475414516
  (1, 16764)	0.18155441508152442
  (1, 25219)	0.20497670900909926
  (1, 31894)	0.22768170303561294
  (1, 77037)	0.18863062654576518
  (1, 83249)	0.26461200519637645
  (1, 96896)	0.41807723485544074
  (1, 99034)	0.18019245070530565
  (1, 122365)	0.14225874519215773
  (1, 123127)	0.2148125605657706
  (1, 131181)	0.21827114979782927
  (1, 136126)	0.3344477921517163
  :	:
  (319997, 22208)	0.4628057952848841
  (319997, 58917)	0.2277152267678488
  (319997, 64373)	0.32871822177455334
  (319997, 76474)	0.20307502633665492
  (319997, 156391)	0

#### Training the Machine Learning Model

##### Logistic Regression

In [36]:
model = LogisticRegression(max_iter=1000)

In [37]:
model.fit(X_train, Y_train)

#### Model Evaluation

#### Accuracy Score

In [38]:
# Score the model on the same samples it was fitted on
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_prediction)

In [39]:
print("Accuracy score on the training data: ", training_data_accuracy)

Accuracy score on the training data:  0.783046875


In [40]:
# Score the model on the held-out 20% it has not seen during fitting
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_prediction)

In [41]:
print("Accuracy score on the test data: ", test_data_accuracy)

Accuracy score on the test data:  0.773665625


##### Saving The Model

In [42]:
import joblib

# Persist the fitted vectorizer and the trained classifier side by side
for artifact, filename in (
    (vectorizer, 'tfidf_vectorizer.joblib'),
    (model, 'sentiment_model.joblib'),
):
    joblib.dump(artifact, filename)

print("✅ Saved vectorizer → tfidf_vectorizer.joblib")
print("✅ Saved model      → sentiment_model.joblib")

✅ Saved vectorizer → tfidf_vectorizer.joblib
✅ Saved model      → sentiment_model.joblib


In [43]:
# Cell: Preprocessing Helper
# (Re-use your existing stemmer & regex logic)

from nltk.stem.porter import PorterStemmer
import re

stemmer = PorterStemmer()

def _english_stop_words() -> frozenset:
    """Return NLTK's English stop words as a frozenset, loading the corpus only once."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        # Local import keeps this inference helper usable on its own
        from nltk.corpus import stopwords
        cached = frozenset(stopwords.words("english"))
        _english_stop_words._cache = cached
    return cached


def preprocess_and_stem(text: str) -> str:
    """Clean and stem a tweet exactly the way the training data was prepared.

    The TF-IDF vectorizer and the classifier were fitted on text produced by the
    notebook's `stemming` step, so inference has to apply the same operations in
    the same order; otherwise the features fed to the model do not match what it
    learned from. In particular, stop words are removed and lowercasing happens
    only after the mention / '#' / 'RT' / link substitutions.

    Args:
        text: Raw tweet text.

    Returns:
        The remaining Porter stems joined by single spaces ('' if none remain).
    """
    text = re.sub(r"@[A-Za-z0-9]+", "", text)      # @mentions (letters/digits only)
    text = re.sub(r"#", "", text)                  # '#' symbol, hashtag word is kept
    text = re.sub(r"RT[\s]+", "", text)            # retweet marker
    text = re.sub(r"https?:\/\/\S+", "", text)     # http/https links
    text = re.sub(r"[^a-zA-Z]", " ", text)         # everything that is not a letter
    stop_words = _english_stop_words()
    tokens = [tok for tok in text.lower().split() if tok not in stop_words]
    return " ".join(stemmer.stem(tok) for tok in tokens)


In [44]:
# Cell: Load & Predict
import joblib

# 1) Load saved artifacts
# NOTE: joblib.load unpickles the files, which can execute arbitrary code;
# only load artifacts you produced yourself or otherwise trust.
vec = joblib.load('tfidf_vectorizer.joblib')
mdl = joblib.load('sentiment_model.joblib')
print("✅ Loaded vectorizer & model")

# 2) Wrap your predict function
def predict_sentiment(text: str) -> str:
    """Classify a single piece of text as "Positive" or "Negative".

    The text is preprocessed, vectorized with the loaded vectorizer and passed
    to the loaded model; a predicted label of 1 is reported as "Positive",
    anything else as "Negative".
    """
    features = vec.transform([preprocess_and_stem(text)])
    label = mdl.predict(features)[0]
    if label == 1:
        return "Positive"
    return "Negative"

# 3) Smoke test on one clearly positive and one clearly negative sentence
samples = (
    "I had an amazing day!",
    "This was the worst experience ever.",
)
for sample in samples:
    verdict = predict_sentiment(sample)
    print(f"{sample!r} → {verdict}")


✅ Loaded vectorizer & model
'I had an amazing day!' → Positive
'This was the worst experience ever.' → Negative


##### Checking Positive or Negative Tweet

In [45]:
if __name__ == "__main__":
    import sys

    # Inside a Jupyter/IPython kernel, sys.argv holds the kernel launcher's own
    # arguments (e.g. "-f <connection-file>.json"), not user input. Treating them
    # as the tweet would silently classify a file path, so command-line words are
    # only honoured when this code runs as a plain script.
    running_in_kernel = "ipykernel" in sys.modules
    cli_words = [] if running_in_kernel else sys.argv[1:]

    if cli_words:
        # Script usage: the tweet text is passed as command-line arguments
        tweet = " ".join(cli_words)
    else:
        # Notebook usage (or no arguments given): ask for the text interactively
        tweet = input("Enter tweet text: ")

    print(f"Tweet → {predict_sentiment(tweet)}")

Tweet → Positive
