#Import Libraries

In [1]:
!pip install nltk scikit-learn spacy



In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

import kagglehub
import os
import re

In [3]:
# Download latest version
path = kagglehub.dataset_download("kazanova/sentiment140")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/kazanova/sentiment140/versions/2


In [4]:
# Check downloaded dataset files
print("Dataset files:", os.listdir(path))

Dataset files: ['training.1600000.processed.noemoticon.csv']


In [5]:
# Load dataset (replace 'your_file.csv' with the actual filename)
df = pd.read_csv(os.path.join(path, "training.1600000.processed.noemoticon.csv"), encoding="ISO-8859-1")

# Show first few rows
print(len(df))
df.head()

1599999


Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [6]:
column_names = ["target", "id", "date", "flag", "user", "text"]
df = pd.read_csv(os.path.join(path, "training.1600000.processed.noemoticon.csv"), encoding="ISO-8859-1", names=column_names)

df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [13]:
df = df.sample(frac=1).iloc[0:10000]

In [14]:
df.info()  # Shows data types and missing values

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 544488 to 1125587
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   target  10000 non-null  int64 
 1   id      10000 non-null  int64 
 2   date    10000 non-null  object
 3   flag    10000 non-null  object
 4   user    10000 non-null  object
 5   text    10000 non-null  object
dtypes: int64(2), object(4)
memory usage: 546.9+ KB


In [15]:
df.describe()  # Statistical summary (for numerical columns)

Unnamed: 0,target,id
count,10000.0,10000.0
mean,2.0416,1994504000.0
std,1.999667,195056800.0
min,0.0,1467824000.0
25%,0.0,1950863000.0
50%,4.0,2001348000.0
75%,4.0,2175884000.0
max,4.0,2329205000.0


In [16]:
df["text"].head()  # Preview some tweets

Unnamed: 0,text
544488,wants to be @babygirlparis ne BFF sorry @Wanni...
131824,I couldn't find my shoes this morning so I had...
1254593,made fried salmon patties with mamaw
939975,@johncmayer check it YouTube mail
266022,Account hacked once again.....


In [17]:
df.isnull().sum()

Unnamed: 0,0
target,0
id,0
date,0
flag,0
user,0
text,0


In [18]:
df["target"].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
4,5104
0,4896


In [19]:
df = df[["target", "text"]]

In [20]:
df["target"] = df["target"].replace({4: 1})

In [21]:
df["text"] = df["text"].str.lower()
df.head(2)

Unnamed: 0,target,text
544488,0,wants to be @babygirlparis ne bff sorry @wanni...
131824,0,i couldn't find my shoes this morning so i had...


In [22]:
def clean_text(text):
    text = re.sub(r"@\w+", "", text)  # Remove mentions
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    return text.strip()

df["text"] = df["text"].apply(clean_text)
df.head(2)

Unnamed: 0,target,text
544488,0,wants to be ne bff sorry
131824,0,i couldnt find my shoes this morning so i had ...


In [23]:
nltk.download('stopwords')

stop_words = set(stopwords.words("english"))
df["text"] = df["text"].apply(lambda x : " ".join(word for word in x.split() if word not in stop_words))
df.head(2)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,target,text
544488,0,wants ne bff sorry
131824,0,couldnt find shoes morning wear ones wore cree...


In [24]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["text"]).toarray()
y = df["target"]

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [27]:
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

Accuracy: 0.736
