In [3]:
!pip install kagglehub

Collecting kagglehub
  Downloading kagglehub-0.3.13-py3-none-any.whl.metadata (38 kB)
Collecting tqdm (from kagglehub)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading kagglehub-0.3.13-py3-none-any.whl (68 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, kagglehub

   ---------------------------------------- 0/2 [tqdm]
   ---------------------------------------- 0/2 [tqdm]
   ---------------------------------------- 0/2 [tqdm]
   ---------------------------------------- 0/2 [tqdm]
   ---------------------------------------- 0/2 [tqdm]
   -------------------- ------------------- 1/2 [kagglehub]
   -------------------- ------------------- 1/2 [kagglehub]
   -------------------- ------------------- 1/2 [kagglehub]
   -------------------- ------------------- 1/2 [kagglehub]
   ---------------------------------------- 2/2 [kagglehub]

Successfully installed kagglehub-0.3.13 tqdm-4.67.1


# Import

In [35]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import save_npz
import seaborn as sns
import kagglehub
import os
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split

# Dataset Download

In [5]:
# Fetch the latest version of the Sentiment140 dataset through kagglehub.
# The returned value is the local directory that holds the dataset files.
DATASET_HANDLE = "kazanova/sentiment140"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/kazanova/sentiment140?dataset_version_number=2...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 80.9M/80.9M [00:14<00:00, 5.98MB/s]

Extracting files...





Path to dataset files: C:\Users\dernj\.cache\kagglehub\datasets\kazanova\sentiment140\versions\2


In [19]:
# Load the tweets into a DataFrame.
# os.listdir() returns entries in arbitrary order, so the CSV is selected
# explicitly (sorted, filtered by extension) instead of taking element [0].
csv_files = sorted(
    entry for entry in os.listdir(path) if entry.lower().endswith('.csv')
)
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in dataset directory: {path}")

# The file has no header row, so the column names are supplied here.
df = pd.read_csv(
    os.path.join(path, csv_files[0]),
    encoding='latin-1',
    header=None,
    names=['target', 'ids', 'date', 'flag', 'user', 'text'],
)

About Dataset
# Context
This is the Sentiment140 dataset. It contains 1,600,000 tweets extracted using the Twitter API. The tweets have been annotated (0 = negative, 4 = positive) and can be used to detect sentiment.

# Content
It contains the following 6 fields:

target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive). Note: the file loaded in this notebook contains only the values 0 and 4.

ids: the id of the tweet (2087)

date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)

flag: The query (lyx). If there is no query, then this value is NO_QUERY.

user: the user that tweeted (robotickilldozr)

text: the text of the tweet (Lyx is cool)

# Acknowledgements
The official link regarding the dataset with resources about how it was generated is here
The official paper detailing the approach is here

Citation: Go, A., Bhayani, R. and Huang, L., 2009. Twitter sentiment classification using distant supervision. CS224N Project Report, Stanford, 1(2009), p.12.

In [20]:
# Preview the first five rows to check that the columns were parsed as expected.
df.head(n=5)

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [24]:
# Labels: the polarity column of the dataset.
target = df['target']

# Class balance. Only the last expression of a cell is displayed, so this is
# the single output of the cell.
target.value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [25]:
# Features: the raw tweet text that the vectorizers below consume.
data = df['text']

# Show the first five tweets.
data.head(n=5)

0    @switchfoot http://twitpic.com/2y1zl - Awww, t...
1    is upset that he can't update his Facebook by ...
2    @Kenichan I dived many times for the ball. Man...
3      my whole body feels itchy and like its on fire 
4    @nationwideclass no, it's not behaving at all....
Name: text, dtype: object

# Text Vectorization

In [30]:
# Split before vectorizing to prevent data leakage: the vectorizers are fitted
# on X_train only.
# Step 1: hold out 30% of the tweets; the remaining 70% is the training set.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.3,
    random_state=0,
)

# Step 2: split the hold-out again. 30% of it becomes the validation set and
# the other 70% stays as the test set (about 9% / 21% of all tweets).
X_test, X_val, y_test, y_val = train_test_split(
    X_test,
    y_test,
    test_size=0.3,
    random_state=0,
)

In [31]:
# Set up the four vectorizers that are applied to the tweet text.
bin_vec = CountVectorizer(binary=True)            # binary term indicators
freq_vec = CountVectorizer(binary=False)          # term counts
tfidf_vec = TfidfVectorizer(max_features=40000)   # TF-IDF, truncated to 40000 features
hash_vec = HashingVectorizer()                    # default settings


def _vectorize_splits(vectorizer):
    """Fit ``vectorizer`` on ``X_train`` and transform all three splits.

    ``fit_transform`` is called on the training split only; the test and
    validation splits are passed through ``transform``.

    Returns:
        A ``(train, test, val)`` tuple with the transformed ``X_train``,
        ``X_test`` and ``X_val``.
    """
    train_matrix = vectorizer.fit_transform(X_train)
    test_matrix = vectorizer.transform(X_test)
    val_matrix = vectorizer.transform(X_val)
    return train_matrix, test_matrix, val_matrix


# CountVectorizer with binary = True
data_bin_train, data_bin_test, data_bin_val = _vectorize_splits(bin_vec)

# CountVectorizer with binary = False
data_freq_train, data_freq_test, data_freq_val = _vectorize_splits(freq_vec)

# TfidfVectorizer truncated
data_tfidf_train, data_tfidf_test, data_tfidf_val = _vectorize_splits(tfidf_vec)

# HashingVectorizer
data_hash_train, data_hash_test, data_hash_val = _vectorize_splits(hash_vec)


# Save the vectorized dataset

In [36]:
# Persist the vectorizers so the same transformations can be reapplied later.
vectorizers = dict(
    bin_vec=bin_vec,
    freq_vec=freq_vec,
    tfidf_vec=tfidf_vec,
    hash_vec=hash_vec,
)

with open('vectorizers.pkl', 'wb') as pkl_file:
    pickle.dump(vectorizers, pkl_file)

# Persist every vectorized split as an .npz file in the working directory.
# NOTE(review): the label splits (y_train / y_test / y_val) are not written
# here — confirm they are saved elsewhere before relying on these files alone.
sparse_outputs = [
    # Binary
    ('data_bin_train.npz', data_bin_train),
    ('data_bin_test.npz', data_bin_test),
    ('data_bin_val.npz', data_bin_val),
    # Frequency
    ('data_freq_train.npz', data_freq_train),
    ('data_freq_test.npz', data_freq_test),
    ('data_freq_val.npz', data_freq_val),
    # TF-IDF
    ('data_tfidf_train.npz', data_tfidf_train),
    ('data_tfidf_test.npz', data_tfidf_test),
    ('data_tfidf_val.npz', data_tfidf_val),
    # Hashing
    ('data_hash_train.npz', data_hash_train),
    ('data_hash_test.npz', data_hash_test),
    ('data_hash_val.npz', data_hash_val),
]

for filename, matrix in sparse_outputs:
    save_npz(filename, matrix)