In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("kazanova/sentiment140")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/sentiment140


In [None]:
import pandas as pd

# read the csv file
tweet_data = pd.read_csv(path + "/training.1600000.processed.noemoticon.csv", encoding="latin-1")
tweet_data.head()

In [22]:
import re
# naming the columns
columns_name = ["target", "ids", "date", "flag", "user", "text"]
tweet_data.columns = columns_name

# filter the unnecessary dataset
tweet_data = tweet_data.drop(["ids", "date", "flag", "user"], axis=1)
tweet_data.head()

# remove the @accname mentions from the text column
tweet_data["text"] = tweet_data["text"].str.replace(re.compile("@\w+"), "", regex=True)

# remove the links from the text column
tweet_data["text"] = tweet_data["text"].str.replace(re.compile("http\S+"), "", regex=True)
tweet_data.head()


Unnamed: 0,target,text
0,0,is upset that he can't update his Facebook by ...
1,0,I dived many times for the ball. Managed to s...
2,0,my whole body feels itchy and like its on fire
3,0,"no, it's not behaving at all. i'm mad. why am..."
4,0,not the whole crew


In [34]:
# create the vector dataset to extract features from the tweets

from sklearn.feature_extraction.text import TfidfVectorizer

# define the vectorizer
# test with different amount of features to maximize the feature extraction
vectorizer = TfidfVectorizer(stop_words="english", max_features= 500).fit(tweet_data["text"])

# transform the dataset
X_tfidf = vectorizer.transform(tweet_data["text"])


In [30]:
# using Countervectorizer
from sklearn.feature_extraction.text import CountVectorizer

# define the vectorizer
# test with different amount of features to maximize the feature extraction
vectorizer = CountVectorizer(stop_words="english", max_features= 500).fit(tweet_data["text"])

# transform the dataset
X_count = vectorizer.transform(tweet_data["text"])

In [9]:
# Explore the feature of the tweets and convert into a dataframe
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
X_df.head()


Unnamed: 0,amp,awesome,bad,bed,best,better,bit,com,come,cool,...,way,week,weekend,wish,won,work,working,yay,yeah,yes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
from sklearn.model_selection import train_test_split

y = tweet_data["target"]

# define the labels, and
# casting the dataset into training and testing datasetX
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=123, stratify=y)


In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# fit the training data using logistic regression model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

In [38]:
# In this tweets sentiment analysis, the more the features are extracted from text,
# the higher the accuracy will result.
print('Accuracy on trainig data: ', log_reg.score(X_train, y_train))
print('Accuracy on testing data: ', log_reg.score(X_test, y_test))

Accuracy on trainig data:  0.7188779053733636
Accuracy on testing data:  0.719459375


In [39]:
# Accuracy score
y_predicted = log_reg.predict(X_test)
print('Accuracy score: ', accuracy_score(y_test, y_predicted))

Accuracy score:  0.719459375


In [40]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, y_predicted)/len(y_test))

[[0.33506875 0.16493125]
 [0.11560938 0.38439062]]


In [50]:
# Test with new prompted dataset
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

new_string = "Life a journey full of joys, and suffering. We become what we pondering on.But this is not the reality, but a reality that we created for oursevle."

new_string2 = "I like this girl, but she doesn't like me."

new_string3 = "OMG, this is scary!"

new_string_vector = vectorizer.transform([new_string3])
predicted_target = log_reg.predict(new_string_vector)[0]

if predicted_target == 4:
    print("Positive")
else:
    print("Negative")
# define the vectorizer
# test with different amount of features to maximize the feature extraction

Negative
