In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import joblib

In [2]:
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

True

In [3]:
stop_words = set(stopwords.words("english"))

In [4]:
dataset_path = "data.csv"  # Update with your file path
df = pd.read_csv(dataset_path, encoding='latin-1', header=None)
df.columns = ["target", "ids", "date", "flag", "user", "text"]

In [5]:
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
df.shape

(1600000, 6)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   ids     1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   flag    1600000 non-null  object
 4   user    1600000 non-null  object
 5   text    1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [30]:
df['text'].unique()

array(["@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D",
       "is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!",
       '@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds',
       ..., 'Are you ready for your MoJo Makeover? Ask me for details ',
       'Happy 38th Birthday to my boo of alll time!!! Tupac Amaru Shakur ',
       'happy #charitytuesday @theNSPCC @SparksCharity @SpeakingUpH4H '],
      shape=(1581466,), dtype=object)

In [8]:
def preprocess_text(text):
    # Remove special characters, URLs, and mentions/hashtags
    text = re.sub(r"(http\S+|www\S+|https\S+|\@\w+|\#)", "", text)
    text = text.lower()  # Convert to lowercase
    # Tokenize and remove stopwords and non-alphanumeric tokens
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return " ".join(tokens)

In [9]:
import time
start_time = time.time()
df["cleaned_text"] = df["text"].apply(preprocess_text)
print(f"Time taken for preprocessing: {time.time() - start_time} seconds")

Time taken for preprocessing: 121.20112109184265 seconds


In [10]:
X = df["cleaned_text"]
y = df["target"].replace({4: 1})  # Convert target to binary: 1 (positive), 0 (negative)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [12]:
model = LogisticRegression()  
model.fit(X_train_tfidf, y_train)

In [13]:
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.77105625
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.75      0.76    159494
           1       0.76      0.80      0.78    160506

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000



In [14]:
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.77105625
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.75      0.76    159494
           1       0.76      0.80      0.78    160506

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000



In [19]:
loaded_model = joblib.load("sentiment_model.pkl")
loaded_vectorizer = joblib.load("tfidf_vectorizer.pkl")

In [27]:
def predict_sentiment(text):
    # Clean and transform text
    cleaned_text = preprocess_text(text)
    text_tfidf = loaded_vectorizer.transform([cleaned_text])
    # Predict sentiment
    prediction = loaded_model.predict(text_tfidf)
    return "Positive" if prediction[0] == 1 else "Negative"

In [32]:
sample_text = "my whole body feels itchy and like its on fire"
print(f"Sentiment: {predict_sentiment(sample_text)}")

Sentiment: Negative
