In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [None]:
column_names = ["Tweet_ID", "Entity", "Sentiment", "Tweet_Content"]

# Load both splits with the same schema. header=None makes pandas treat the
# first line of each file as data, so the column names are supplied explicitly.
df_train, df_val = (
    pd.read_csv(csv_path, names=column_names, header=None)
    for csv_path in ("twitter_training.csv", "twitter_validation.csv")
)

# Preview the first rows of the training set
print(df_train.head())


   Tweet_ID       Entity Sentiment  \
0      2401  Borderlands  Positive   
1      2401  Borderlands  Positive   
2      2401  Borderlands  Positive   
3      2401  Borderlands  Positive   
4      2401  Borderlands  Positive   

                                       Tweet_Content  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  


In [None]:
print(df_train.columns)  # Show every column name present in the training DataFrame


Index(['Tweet_ID', 'Entity', 'Sentiment', 'Tweet_Content'], dtype='object')


In [None]:
# Fetch the "punkt_tab" tokenizer data into the local NLTK data directory.
# NOTE(review): presumably required by the word_tokenize call used during text
# cleaning with the installed NLTK version — confirm.
nltk.download("punkt_tab")


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Make sure the NLTK resources used for cleaning are available locally
# (stopword lists first, then the punkt tokenizer data).
for _resource in ("stopwords", "punkt"):
    nltk.download(_resource)

# English stopwords as a set, so membership checks during cleaning are cheap.
stop_words = {*stopwords.words("english")}

def clean_text(text):
    """Normalise a raw tweet into a space-separated string of content words.

    Steps, in order: lowercase, strip URLs, strip digits, strip punctuation,
    tokenise with ``word_tokenize`` and drop tokens found in ``stop_words``.

    Args:
        text: The raw tweet. Non-string values are converted with ``str()``.
            Missing values (``None`` or a float NaN) yield an empty string.

    Returns:
        str: The surviving tokens joined by single spaces ("" if none remain).
    """
    # Guard against missing tweets: str(NaN) would otherwise become the literal
    # token "nan" and leak into the vocabulary built from the cleaned text.
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = word_tokenize(text)  # Tokenise
    words = [word for word in words if word not in stop_words]  # Drop stopwords
    return " ".join(words)

# Add a "cleaned_tweet" column to both splits by running clean_text on every tweet
for _frame in (df_train, df_val):
    _frame["cleaned_tweet"] = _frame["Tweet_Content"].apply(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to both splits so the codes agree between them.
label_encoder = LabelEncoder().fit(df_train["Sentiment"])

for _frame in (df_train, df_val):
    _frame["sentiment_encoded"] = label_encoder.transform(_frame["Sentiment"])

# Inspect the encoding: the position of each label is its integer code
print(label_encoder.classes_)


['Irrelevant' 'Negative' 'Neutral' 'Positive']


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vocabulary and IDF weights on the training tweets only, then reuse
# them to transform the validation tweets.
# The matrices are kept in the sparse format returned by the vectorizer:
# calling .toarray() would allocate a dense (n_rows x 5000) float array, and
# scikit-learn estimators such as LogisticRegression accept sparse input.
X_train = tfidf_vectorizer.fit_transform(df_train["cleaned_tweet"])
X_val = tfidf_vectorizer.transform(df_val["cleaned_tweet"])

# Labels
y_train = df_train["sentiment_encoded"]
y_val = df_val["sentiment_encoded"]


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialise the multi-class classifier.
# The `multi_class` argument is intentionally omitted: it is deprecated in
# recent scikit-learn releases, and with the lbfgs solver a multinomial
# (softmax) model is already what gets fitted when there are 3+ classes.
model = LogisticRegression(solver='lbfgs', max_iter=500)
model.fit(X_train, y_train)

# Predict on the validation split
y_pred = model.predict(X_val)

# Evaluate: overall accuracy plus per-class precision/recall/F1,
# reported under the original string labels.
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:\n", classification_report(y_val, y_pred, target_names=label_encoder.classes_))




Accuracy: 0.802
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.81      0.72      0.76       172
    Negative       0.77      0.86      0.81       266
     Neutral       0.83      0.75      0.79       285
    Positive       0.81      0.86      0.83       277

    accuracy                           0.80      1000
   macro avg       0.80      0.80      0.80      1000
weighted avg       0.80      0.80      0.80      1000



In [None]:
def predict_sentiment(text):
    """Predict the sentiment label of a single raw tweet.

    The text is passed through ``clean_text``, vectorised with the fitted
    ``tfidf_vectorizer`` and classified by ``model``.

    Args:
        text: The raw tweet text.

    Returns:
        The predicted label decoded by ``label_encoder`` back to its original value.
    """
    features = tfidf_vectorizer.transform([clean_text(text)]).toarray()
    predicted_codes = model.predict(features)
    # Map the integer class code back to the original label
    return label_encoder.inverse_transform(predicted_codes)[0]

# Quick check: classify one hand-written example tweet and print the predicted label
new_tweet = "I love playing Borderlands, it's amazing!"
print(predict_sentiment(new_tweet))


Positive
