In [56]:
import numpy as np 
import pandas as pd 
from wordcloud import WordCloud #Word visualization
import matplotlib.pyplot as plt #Plotting properties
import seaborn as sns #Plotting properties
from sklearn.feature_extraction.text import CountVectorizer #Data transformation
from sklearn.model_selection import train_test_split #Data testing
from sklearn.linear_model import LogisticRegression #Prediction Model
from sklearn.metrics import accuracy_score #Comparison between real and predicted
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder #Variable encoding and decoding for XGBoost
import re #Regular expressions
import nltk
from nltk import word_tokenize


In [57]:
df = pd.read_csv("twitter_training.csv")

In [58]:
df.columns=['id','entity','type','content']
df.head()

Unnamed: 0,id,entity,type,content
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [59]:
train_df=df
train_df

Unnamed: 0,id,entity,type,content
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [60]:
#Text transformation
train_df["lower"]=train_df.content.str.lower()
train_df["lower"]=[str(df) for df in train_df.lower]
train_df["lower"]=train_df.lower.apply(lambda x: re.sub('[^A-Za-z0-9 ]+', ' ', x))

In [61]:
train_df.head()

Unnamed: 0,id,entity,type,content,lower
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,i am coming to the borders and i will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,im getting into borderlands and i can murder y...


In [62]:
tokens_content = [word_tokenize(str(word)) for word in train_df.lower]
tokens_counter = [item for sublist in tokens_content for item in sublist]
print("Number of tokens: ", len(set(tokens_counter)))

Number of tokens:  30436


In [63]:
tokens_content[1]

['im',
 'getting',
 'on',
 'borderlands',
 'and',
 'i',
 'will',
 'kill',
 'you',
 'all']

In [64]:
stopwords_nltk = nltk.corpus.stopwords
stop_words = stopwords_nltk.words('english')
stop_words[:5]

['i', 'me', 'my', 'myself', 'we']

In [65]:
bow_counts = CountVectorizer(
    tokenizer=word_tokenize,
    stop_words=stop_words,
    ngram_range=(1, 1),
    max_features=2000  # Contoh: Ambil 5000 fitur teratas berdasarkan frekuensi
)


In [81]:
X_train, X_test = train_test_split(df, test_size = 0.3, random_state=0)

In [67]:
X_train_bow = bow_counts.fit_transform(X_train.lower)
X_test_bow = bow_counts.transform(X_test.lower)

  % sorted(inconsistent)


In [68]:
y_train_bow = X_train['type']
y_test_bow = X_test['type']

In [69]:
X_train_bow = X_train_bow.toarray()
X_test_bow = X_test_bow.toarray()

In [75]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Multinomial Naive Bayes Model
nb_model = MultinomialNB()
nb_model.fit(X_train_bow_sparse, y_train_bow)
nb_pred = nb_model.predict(X_test_bow_sparse)

# Classification Report
print("Multinomial Naive Bayes Classification Report")
print(metrics.classification_report(y_test_bow, nb_pred))

# Confusion Matrix
print("Confusion Matrix")
print(metrics.confusion_matrix(y_test_bow, nb_pred))

# Accuracy
print('Multinomial Naive Bayes Accuracy : {:.2f}%'.format(metrics.accuracy_score(y_test_bow, nb_pred) * 100))

# Count of each category in predictions
unique, counts = np.unique(nb_pred, return_counts=True)
print("Category Counts: ", dict(zip(unique, counts)))



Multinomial Naive Bayes Classification Report
              precision    recall  f1-score   support

  Irrelevant       0.82      0.62      0.71      3848
    Negative       0.71      0.84      0.77      6754
     Neutral       0.81      0.65      0.72      5467
    Positive       0.72      0.80      0.76      6336

    accuracy                           0.75     22405
   macro avg       0.76      0.73      0.74     22405
weighted avg       0.75      0.75      0.74     22405

Confusion Matrix
[[2379  620  229  620]
 [ 168 5667  329  590]
 [ 202  916 3574  775]
 [ 150  800  294 5092]]
Multinomial Naive Bayes Accuracy : 74.59%
Category Counts:  {'Irrelevant': 2899, 'Negative': 8003, 'Neutral': 4426, 'Positive': 7077}


In [78]:
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# Gaussian Naive Bayes Model
bayes_model = GaussianNB()

# Fit the model on the training data
bayes_model.fit(X_train_bow, y_train_bow)

# Make predictions on the test data
bayes_prediction = bayes_model.predict(X_test_bow)

# Classification Report
print("Gaussian Naive Bayes Classification Report")
print(metrics.classification_report(y_test_bow, bayes_prediction))

# Confusion Matrix
print("Confusion Matrix")
print(metrics.confusion_matrix(y_test_bow, bayes_prediction))

# Accuracy
print('Gaussian Naive Bayes Accuracy : {:.2f}%'.format(metrics.accuracy_score(y_test_bow, bayes_prediction) * 100))

# Count of each category in predictions
unique, counts = np.unique(bayes_prediction, return_counts=True)
print("Category Counts: ", dict(zip(unique, counts)))


Gaussian Naive Bayes Classification Report
              precision    recall  f1-score   support

  Irrelevant       0.25      0.84      0.39      3848
    Negative       0.72      0.37      0.49      6754
     Neutral       0.66      0.27      0.39      5467
    Positive       0.53      0.33      0.41      6336

    accuracy                           0.42     22405
   macro avg       0.54      0.45      0.42     22405
weighted avg       0.57      0.42      0.42     22405

Confusion Matrix
[[3243  171  147  287]
 [3130 2508  334  782]
 [2763  443 1483  778]
 [3600  370  270 2096]]
Gaussian Naive Bayes Accuracy : 41.64%
Category Counts:  {'Irrelevant': 12736, 'Negative': 3492, 'Neutral': 2234, 'Positive': 3943}


In [89]:
# Function to get top N words per category
def get_top_n_words(vectorizer, X, y, n=10):
    words = vectorizer.get_feature_names_out()
    categories = np.unique(y)
    top_words = {}

    for category in categories:
        word_counts = X[y == category].sum(axis=0)
        top_indices = word_counts.argsort()[::-1][:n]
        top_words[category] = [words[i] for i in top_indices]

    return top_words

top_words = get_top_n_words(bow_counts, X_train_bow, y_train_bow)

print("\nTop words per category:")
for category, words in top_words.items():
    print(f"{category}: {', '.join(words)}")

# Combine test set with predictions
X_test_combined = X_test.copy()
X_test_combined['nb_pred'] = nb_pred
X_test_combined['bayes_pred'] = bayes_prediction

# Function to get example sentences per category
def get_example_sentences(df, predictions, category, n=3):
    category_indices = (predictions == category) & (df['type'] == category)
    sentences = df[category_indices]['content'].head(n).tolist()
    return sentences

# Get example sentences for Multinomial Naive Bayes
print("\nExample sentences per category (Multinomial Naive Bayes):")
categories = ['negative', 'neutral', 'positive']
for category in categories:
    print(f"\n{category.capitalize()} sentences:")
    examples = get_example_sentences(X_test_combined, X_test_combined['nb_pred'], category)
    for i, sentence in enumerate(examples, 1):
        print(f"{i}. {sentence}")




Top words per category:
Irrelevant: com, player, ban, like, see, game, people, 4, good, love
Negative: game, com, get, like, shit, twitter, pic, fix, fuck, play
Neutral: com, johnson, 2, co, amazon, https, game, facebook, twitter, pic
Positive: game, com, love, good, 2, like, pic, really, new, twitter

Example sentences per category (Multinomial Naive Bayes):

Negative sentences:

Neutral sentences:

Positive sentences:

Example sentences per category (Gaussian Naive Bayes):

Negative sentences:

Neutral sentences:

Positive sentences:
