## Penjelasan Dataset yang digunakan

In [4]:
## Berikan penjelasan terkait data apa yang digunakan diantaranya:
# 1. Kasus yang diambil
# 2. Penjelasan setiap kolomnya
# 3. Import library
# 4. Load data

In [7]:
# =====================================================
# 1. Kasus yang diambil
# =====================================================

# Dataset ini berisi kumpulan tweet yang diposting selama masa pandemi COVID-19.
# Tujuannya adalah untuk menganalisis sentimen masyarakat terhadap COVID-19
# berdasarkan teks tweet mereka.
# Setiap tweet memiliki label sentimen:
#    - Extremely Positive
#    - Positive
#    - Neutral
#    - Negative
#    - Extremely Negative
#
# Analisis ini dapat digunakan untuk memahami opini publik, penyebaran emosi,
# serta persepsi masyarakat terhadap pandemi atau kebijakan pemerintah.


# =====================================================
# 2. Penjelasan setiap kolomnya
# =====================================================

# Dataset ini biasanya disimpan dalam file bernama:
#     Corona_NLP_train.csv  (data latih)
#     Corona_NLP_test.csv   (data uji)
#
# Berikut penjelasan kolom-kolomnya:

# 1. UserName          → Nama pengguna Twitter
# 2. ScreenName        → Nama tampilan pengguna (username unik di Twitter)
# 3. Location          → Lokasi pengguna (jika tersedia)
# 4. TweetAt           → Tanggal tweet dibuat
# 5. OriginalTweet     → Isi teks asli dari tweet (data utama untuk analisis)
# 6. Sentiment         → Label sentimen dari tweet
#                         (Extremely Negative, Negative, Neutral, Positive, Extremely Positive)


# =====================================================
# 3. Import Library
# =====================================================

# Kita perlu mengimpor beberapa library Python untuk membaca dan mengolah data.

import pandas as pd          # Untuk memuat dan memanipulasi data tabular (CSV)
import numpy as np           # Untuk operasi numerik
import matplotlib.pyplot as plt  # Untuk visualisasi data
import seaborn as sns        # Untuk visualisasi data yang lebih menarik
import re                    # Untuk preprocessing teks (membersihkan karakter khusus)
import nltk                  # Untuk analisis teks (tokenisasi, stopwords, dsb)
from sklearn.model_selection import train_test_split  # Untuk membagi data train/test
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer  # Untuk representasi teks
from sklearn.linear_model import LogisticRegression   # Contoh algoritma klasifikasi
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# =====================================================
# 4. Load Data
# =====================================================

# 'Corona_NLP_train.csv' must already be downloaded from Kaggle and placed in
# the working directory; pd.read_csv raises FileNotFoundError otherwise.

# Read the training split with pandas (the file is decoded as latin1).
data = pd.read_csv('Corona_NLP_train.csv', encoding='latin1')

# Preview the first 5 rows.
print(data.head())

# General overview of the dataset (columns, dtypes, non-null counts).
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
data.info()

# Number of rows per sentiment category.
print(data['Sentiment'].value_counts())



FileNotFoundError: [Errno 2] No such file or directory: 'Corona_NLP_train.csv'

## Data Understanding

In [None]:
# 1. Jumlah baris data
# 2. Panjang rata-rata setiap baris
# 3. Cek data duplikasi
# 4. Cek data kosong
# 5. Distribusi data menggunakan bar chart, line chart atau word cloud, seperti kata yang sering muncul.

In [None]:
# =====================================================
# Analisis Awal Dataset Corona Tweets NLP - COVID19 Sentiment
# =====================================================

# Import library yang dibutuhkan
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# =====================================================
# 1. Load data
# =====================================================

# The CSV file must already be present in the working directory.
data = pd.read_csv('Corona_NLP_train.csv', encoding='latin1')

# Preview the first 5 rows.
print(data.head(n=5))

# =====================================================
# 2. Number of rows
# =====================================================

# DataFrame.shape is a (rows, columns) tuple.
dataset_shape = data.shape
print("# Jumlah baris dan kolom:", dataset_shape)
# Output noted in the original notebook: (41157, 6) → 41,157 rows and 6 columns

# =====================================================
# 3. Average length of each row (characters in the tweet column)
# =====================================================

# Cast every tweet to str first, then store its character count in a new column.
tweet_as_text = data['OriginalTweet'].astype(str)
data['tweet_length'] = tweet_as_text.map(len)

# Mean tweet length in characters.
avg_length = data['tweet_length'].mean()
print("# Panjang rata-rata tweet:", avg_length)

# =====================================================
# 4. Duplicate check
# =====================================================

# Rows that are identical across *every* column (this includes the
# 'tweet_length' column added above and any identifier columns).
duplicate_count = data.duplicated().sum()
print("# Jumlah data duplikat:", duplicate_count)

# The stated goal is to count duplicate tweets, which a whole-row comparison
# cannot do when other columns differ between rows, so the tweet text is also
# compared on its own.
# NOTE(review): presumably identifier columns are unique per row, which would
# make the whole-row count always 0 — confirm against the data.
duplicate_tweet_count = data.duplicated(subset=['OriginalTweet']).sum()
print("# Jumlah tweet dengan teks duplikat:", duplicate_tweet_count)

# To remove repeated tweets, use data.drop_duplicates(subset=['OriginalTweet']).

# =====================================================
# 5. Missing-value check
# =====================================================

# Number of null cells in each column.
print("# Jumlah data kosong per kolom:")
print(data.isnull().sum())

# =====================================================
# 6. Sentiment distribution (bar chart)
# =====================================================

# Bars are ordered from the most to the least frequent sentiment.
sentiment_order = data['Sentiment'].value_counts().index

fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(
    x='Sentiment',
    data=data,
    order=sentiment_order,
    palette='viridis',
    ax=ax,
)
ax.set_title('Distribusi Sentimen Tweet COVID-19')
ax.set_xlabel('Kategori Sentimen')
ax.set_ylabel('Jumlah Tweet')
# Tilt the category labels slightly so long names do not overlap.
ax.tick_params(axis='x', labelrotation=15)
plt.show()

# =====================================================
# 7. Number of tweets over time (line chart)
# =====================================================

# Convert the date column to datetime; unparseable values become NaT.
# dayfirst=True makes ambiguous values such as "03-04-2020" resolve as
# 3 April rather than 4 March.
# NOTE(review): assumes TweetAt is written day-first (dd-mm-yyyy), as in the
# public Kaggle release of this dataset — confirm against the CSV.
data['TweetAt'] = pd.to_datetime(data['TweetAt'], dayfirst=True, errors='coerce')

# Tweets per calendar date (groupby drops NaT keys by default).
tweet_per_day = data.groupby('TweetAt').size()

plt.figure(figsize=(10, 4))
tweet_per_day.plot(kind='line')
plt.title('Jumlah Tweet per Hari selama COVID-19')
plt.xlabel('Tanggal')
plt.ylabel('Jumlah Tweet')
plt.grid(True)
plt.show()

# =====================================================
# 8. Word cloud: frequently occurring words
# =====================================================

# Concatenate every tweet (cast to str) into one space-separated string.
text = " ".join(data['OriginalTweet'].astype(str))

# Build the word cloud from the combined text.
cloud_options = {
    'width': 1000,
    'height': 600,
    'background_color': 'white',
    'colormap': 'inferno',
}
wordcloud = WordCloud(**cloud_options).generate(text)

# Render the cloud as an image without axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud dari Tweet COVID-19')
plt.show()


ModuleNotFoundError: No module named 'wordcloud'

## Data Text Processing

In [None]:
# 1. Tokenisasi
# 2. Lemmatization
# 3. Stemming
# 4. Stopword removal (Tanda baca, angka dan kata)
# 5. Text Normalisasi
# 6. Matrix correlation (opsional)
# 7. Labeling data (Lexicon, Bert, atau polarity)
# 8. Text Vektorisasi
# 9. Data splitting dengan skala (0.8, 0.2), (0.9, 0.1), (0.75, 0.25), atau (0.85, 0.15); pilih salah satu dari beberapa skala yang ditentukan

In [None]:
# =====================================================
# Analisis Sentimen Tweet COVID-19 (Tahap Preprocessing)
# =====================================================

# 1️⃣ Import Library
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

# Download the NLTK resources used below (only needs to run once per environment).
nltk.download('punkt')
# Recent NLTK releases load the tokenizer models used by word_tokenize from
# 'punkt_tab' instead of 'punkt'; fetching both keeps tokenisation working
# across versions.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# =====================================================
# 2. Load dataset
# =====================================================
# Read the CSV, keep only the tweet text and its sentiment label, and drop
# rows where either of the two is missing.
data = (
    pd.read_csv('Corona_NLP_train.csv', encoding='latin1')
    .loc[:, ['OriginalTweet', 'Sentiment']]
    .dropna()
)

# =====================================================
# 3. Text cleaning (URLs, mentions, punctuation, digits)
# =====================================================

def clean_text(text):
    """Return a lowercase, letters-only, single-spaced version of *text*.

    The input is first converted with ``str()``, so non-string values are
    accepted. The substitutions run in this order:

    1. remove URLs (anything starting with ``http`` or ``www``),
    2. remove ``@mention`` tokens and the ``#`` symbol (the hashtag word
       itself is kept),
    3. remove every character that is not ``a-z`` or whitespace,
    4. collapse whitespace runs to one space.

    Leading and trailing whitespace is stripped from the result.
    """
    substitutions = (
        (r"http\S+|www\S+|https\S+", ''),  # URLs
        (r'@\w+|#', ''),                   # mentions and the '#' symbol
        (r'[^a-z\s]', ''),                 # digits, punctuation, non a-z chars
        (r'\s+', ' '),                     # repeated whitespace
    )
    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Clean every original tweet with clean_text.
data['clean_tweet'] = data['OriginalTweet'].map(clean_text)

# =====================================================
# 4. Tokenisation
# =====================================================
from nltk.tokenize import word_tokenize

# Split each cleaned tweet into a list of word tokens.
data['tokens'] = data['clean_tweet'].map(word_tokenize)

# =====================================================
# 5. Stopword removal
# =====================================================
# English stopword list from NLTK, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))


def remove_stopwords(tokens):
    """Return the items of *tokens* that are not in ``stop_words``, in order."""
    return [token for token in tokens if token not in stop_words]


data['tokens_nostop'] = data['tokens'].map(remove_stopwords)

# =====================================================
# 6. Lemmatization
# =====================================================
lemmatizer = WordNetLemmatizer()


def lemmatize_tokens(tokens):
    """Return a list with ``lemmatizer.lemmatize`` applied to each token."""
    return list(map(lemmatizer.lemmatize, tokens))


data['lemma_tokens'] = data['tokens_nostop'].map(lemmatize_tokens)

# =====================================================
# 7. Stemming
# =====================================================
stemmer = PorterStemmer()


def stem_tokens(tokens):
    """Return a list with ``stemmer.stem`` applied to each token."""
    return list(map(stemmer.stem, tokens))


# Stemming runs on the already-lemmatized tokens.
data['stem_tokens'] = data['lemma_tokens'].map(stem_tokens)

# =====================================================
# 8. Text normalisation (join the tokens back into one clean string)
# =====================================================
data['normalized_text'] = data['stem_tokens'].map(' '.join)

# =====================================================
# 9. Labelling
# =====================================================

# The dataset already has a 'Sentiment' column, so the text labels only need
# to be mapped to integers. A lexicon or a BERT model would be an option for
# unlabelled data, but is not needed here.

label_encoder = LabelEncoder()
label_encoder.fit(data['Sentiment'])
data['label'] = label_encoder.transform(data['Sentiment'])

# Inspect the learned classes and a sample of the text/integer pairs.
print(label_encoder.classes_)
print(data[['Sentiment', 'label']].head())

# =====================================================
# 10. Data splitting and text vectorisation (TF-IDF)
# =====================================================
# The split is done on the normalised text *before* fitting TF-IDF, so the
# vocabulary and IDF weights are learned from the training rows only. Fitting
# on the full dataset first would leak test-set statistics into the features.
# Chosen split ratio: 0.8 train / 0.2 test.

tfidf = TfidfVectorizer(max_features=5000)
y = data['label']

text_train, text_test, y_train, y_test = train_test_split(
    data['normalized_text'], y, test_size=0.2, random_state=42
)

# The matrices are kept sparse: a dense rows x 5000 float array would be
# mostly zeros, and LogisticRegression accepts sparse input directly.
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Full-dataset matrix, kept under the name X for any later use; it is built
# with the vectorizer fitted on the training rows.
X = tfidf.transform(data['normalized_text'])

print("Shape X:", X.shape)
print("Shape y:", y.shape)

print("Data Train:", X_train.shape)
print("Data Test:", X_test.shape)

# =====================================================
# 11. (Optional) "Matrix correlation" step
# =====================================================
# What is drawn here is a per-class frequency bar chart of the labels,
# not a correlation matrix.

label_order = data['Sentiment'].value_counts().index

fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(
    x='Sentiment',
    data=data,
    order=label_order,
    palette='coolwarm',
    ax=ax,
)
ax.set_title("Distribusi Sentimen Setelah Labeling")
ax.tick_params(axis='x', labelrotation=15)
plt.show()


## Data Modeling

In [None]:
# Berikan Penjelasan tentang model yang dipilih

In [None]:
# =====================================================
# MODELING: Logistic Regression untuk Analisis Sentimen
# =====================================================

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Build and train the model.
# multi_class is not passed: 'auto' was already the default, and the parameter
# is deprecated in recent scikit-learn releases and scheduled for removal.
# max_iter is raised to 1000 to give lbfgs more iterations to converge.
model = LogisticRegression(max_iter=1000, solver='lbfgs')
model.fit(X_train, y_train)

# Predict the held-out test rows.
y_pred = model.predict(X_test)

# Evaluate: overall accuracy, confusion matrix, and per-class metrics labelled
# with the original sentiment names.
print("Akurasi Model:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))


## Data Evaluasi

In [None]:
# 1. Confusion Matrix
# 2. Laporan Klasifikasi (Classification report)

In [None]:
# =====================================================
# Evaluasi Model Logistic Regression
# =====================================================

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Predict the test rows.
y_pred = model.predict(X_test)

# =====================================================
# 1. Confusion matrix
# =====================================================

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)

# Raw counts.
print("Confusion Matrix (Angka):")
print(cm)

# Same matrix as an annotated heatmap, with the original class names on
# both axes.
class_names = label_encoder.classes_
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title("Confusion Matrix - Logistic Regression")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
plt.show()

# How to read the plot:
# - X axis = label predicted by the model
# - Y axis = true label from the data
# - Diagonal cells = number of correct predictions
# - Off-diagonal cells = number of wrong predictions


# =====================================================
# 2. Classification report
# =====================================================

# Per-class metrics, labelled with the original sentiment names.
report = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
)

print("Classification Report:", report, sep="\n")

# Meaning of each metric:
# - precision : share of the predictions for a class that are correct
# - recall    : share of the true members of a class that the model found
# - f1-score  : harmonic mean of precision and recall (main indicator)
# - support   : number of true samples in each class


# =====================================================
# 3. Overall accuracy (optional)
# =====================================================
accuracy = accuracy_score(y_test, y_pred)
accuracy_percent = round(accuracy * 100, 2)
print("Akurasi Model Keseluruhan:", accuracy_percent, "%")


## Data Prediksi

In [None]:
# Gunakan model yang sudah dibuat dan prediksi pada data baru

In [None]:
# =====================================================
# PREDIKSI DATA BARU MENGGUNAKAN MODEL YANG SUDAH DILATIH
# =====================================================

import pandas as pd

# Sample new texts (tweets) to classify.
data_baru = [
    "I am so happy that the vaccine is finally here!",
    "The government is handling the pandemic very poorly.",
    "Stay safe everyone, we will get through this together.",
    "I lost my job because of COVID-19, everything feels hopeless.",
    "Covid situation seems under control now.",
]

# Wrap the texts in a single-column DataFrame for easier display.
df_baru = pd.DataFrame({"tweet": data_baru})

# =====================================================
# 1. PREPROCESSING THE NEW DATA
# =====================================================
# New texts must go through the same steps as the training texts, because the
# TF-IDF vocabulary was learned from text that was cleaned, tokenised,
# stopword-filtered, lemmatized and then stemmed. Skipping a step (for
# example stemming) yields tokens the vectorizer has never seen.

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Make sure the required NLTK resources are available.
nltk.download('punkt')
nltk.download('punkt_tab')  # tokenizer data looked up by recent NLTK releases
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()


def clean_text(text):
    """Turn a raw text into the normalised string format used for training.

    Steps: convert to lowercase ``str``; remove URLs, ``@mentions`` and the
    ``#`` symbol; remove everything that is not ``a-z`` or whitespace;
    tokenise with ``word_tokenize``; drop English stopwords; lemmatize and
    then stem each remaining token; join the tokens with single spaces.
    """
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # URLs
    text = re.sub(r'@\w+|#', '', text)                   # mentions and '#'
    text = re.sub(r'[^a-z\s]', '', text)                 # digits & punctuation
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    # Lemmatize first, then stem, in the same order as the training pipeline.
    tokens = [stemmer.stem(lemmatizer.lemmatize(word)) for word in tokens]
    return " ".join(tokens)


# Apply the function to every new tweet.
df_baru["clean_tweet"] = df_baru["tweet"].apply(clean_text)

# =====================================================
# 2. VECTORISE THE NEW TEXTS
# =====================================================
# Reuse the TF-IDF vectorizer fitted during training: call transform() only,
# never fit again.
X_baru = tfidf.transform(df_baru["clean_tweet"])

# =====================================================
# 3. PREDICT THE SENTIMENT
# =====================================================
# Integer class predictions from the trained Logistic Regression model.
prediksi_baru = model.predict(X_baru)

# Map the integer labels back to the original sentiment names and store
# them next to the input texts.
hasil_prediksi = label_encoder.inverse_transform(prediksi_baru)
df_baru["predicted_sentiment"] = hasil_prediksi

# =====================================================
# 4. SHOW THE RESULT
# =====================================================
report_columns = ["tweet", "predicted_sentiment"]
print("=== HASIL PREDIKSI DATA BARU ===")
print(df_baru[report_columns])
