<a href="https://colab.research.google.com/github/RidhoSuharis/machine-learning-project-python/blob/main/FatureExtraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch every NLTK resource the notebook relies on:
#   - punkt / stopwords: tokenizer model and stop-word lists
#   - wordnet / averaged_perceptron_tagger: needed by the WordNetLemmatizer
#     and pos_tag calls inside preprocess_text
# NOTE(review): recent NLTK releases may expect differently named resources
# (e.g. 'punkt_tab', 'averaged_perceptron_tagger_eng') - confirm against the
# installed NLTK version.
for resource in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
# Small Indonesian-language corpus used by every later cell.
sentences = [
    "Saya suka belajar data science.",
    "Python adalah bahasa pemrograman yang populer.",
    "Saya menggunakan Python untuk analisis data.",
    "Analisis data membantu dalam pengambilan keputusan.",
    "Machine learning adalah cabang dari kecerdasan buatan.",
    "Algoritma machine learning bisa memprediksi hasil.",
    "Saya tertarik pada teknologi baru.",
    "Kecerdasan buatan memiliki banyak aplikasi.",
    "Belajar data science sangat menarik.",
    "Saya mengikuti kursus online tentang machine learning.",
]


In [15]:
def preprocess_text(text):
    """Run a step-by-step preprocessing pipeline and return every stage.

    The input is lowercased, stripped of ASCII punctuation, tokenized,
    filtered against the NLTK English stop-word list (keeping only
    alphanumeric tokens), then both stemmed (Porter) and lemmatized
    (WordNet). Each of the two resulting token lists is POS-tagged.

    NOTE(review): a later cell defines another ``preprocess_text``; once that
    cell has run, this version is shadowed and no longer reachable by name.
    NOTE(review): the stop-word list and lemmatizer here are English, while
    the ``sentences`` corpus in this notebook is Indonesian - confirm this is
    intentional.

    Parameters
    ----------
    text : str
        Raw input sentence.

    Returns
    -------
    dict
        ``original_text`` (the untouched input), ``normalized_text``
        (lowercased, punctuation removed), ``tokens``, ``filtered_tokens``,
        ``stemmed_tokens``, ``lemmatized_tokens``, ``pos_tagged_tokens1``
        (tags for the stems) and ``pos_tagged_tokens2`` (tags for the lemmas).
    """
    # Keep the caller's input intact so "original_text" really is the original;
    # previously the lowercased/stripped text was returned under that key.
    normalized_text = text.lower()
    normalized_text = normalized_text.translate(
        str.maketrans("", "", string.punctuation)
    )
    tokens = word_tokenize(normalized_text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words and word.isalnum()]
    ps = PorterStemmer()
    stemmed_tokens = [ps.stem(word) for word in filtered_tokens]
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    pos_tagged_tokens1 = pos_tag(stemmed_tokens)
    pos_tagged_tokens2 = pos_tag(lemmatized_tokens)
    return {
        "original_text": text,
        "normalized_text": normalized_text,
        "tokens": tokens,
        "filtered_tokens": filtered_tokens,
        "stemmed_tokens": stemmed_tokens,
        "lemmatized_tokens": lemmatized_tokens,
        "pos_tagged_tokens1": pos_tagged_tokens1,
        "pos_tagged_tokens2": pos_tagged_tokens2,
        }

In [16]:
def preprocess_text(text):
    """Normalize a sentence into a single space-joined string of stemmed tokens.

    Steps: lowercase, drop ASCII punctuation, replace any remaining non-word
    character with a space, tokenize, remove tokens found in the NLTK
    Indonesian stop-word list, and stem what is left with the Porter stemmer.

    NOTE(review): the Porter stemmer targets English; on this Indonesian
    corpus it yields stems such as 'analisi' and 'kursu' - confirm that is
    acceptable.

    Parameters
    ----------
    text : str
        Raw input sentence.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces.
    """
    punctuation_table = str.maketrans("", "", string.punctuation)
    cleaned = text.lower().translate(punctuation_table)
    cleaned = re.sub(r'\W', ' ', cleaned)

    candidate_tokens = word_tokenize(cleaned)
    indonesian_stop_words = set(stopwords.words('indonesian'))
    porter = PorterStemmer()

    return ' '.join(
        porter.stem(token)
        for token in candidate_tokens
        if token not in indonesian_stop_words
    )

# Apply the preprocessing pipeline to every sentence of the corpus, in order.
preprocessed_sentences = list(map(preprocess_text, sentences))

In [17]:
# Bag-of-words: learn the vocabulary from the preprocessed corpus and count
# term occurrences per sentence.
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(preprocessed_sentences)

# Show the sorted feature names, the term -> column-index mapping, and the
# dense count matrix.
bow_report = (
    ("Vocabulary:\n", vectorizer.get_feature_names_out()),
    ('\nVector vocabulary:\n', vectorizer.vocabulary_),
    ("\nBag of Words:\n", X_bow.toarray()),
)
for heading, value in bow_report:
    print(heading, value)

Vocabulary:
 ['algoritma' 'analisi' 'aplikasi' 'bahasa' 'belajar' 'buatan' 'cabang'
 'data' 'hasil' 'kecerdasan' 'keputusan' 'kursu' 'learn' 'machin'
 'membantu' 'memiliki' 'memprediksi' 'menarik' 'mengikuti' 'onlin'
 'pemrograman' 'pengambilan' 'popul' 'python' 'scienc' 'suka' 'teknolog'
 'tertarik']

Vector vocabulary:
 {'suka': 25, 'belajar': 4, 'data': 7, 'scienc': 24, 'python': 23, 'bahasa': 3, 'pemrograman': 20, 'popul': 22, 'analisi': 1, 'membantu': 14, 'pengambilan': 21, 'keputusan': 10, 'machin': 13, 'learn': 12, 'cabang': 6, 'kecerdasan': 9, 'buatan': 5, 'algoritma': 0, 'memprediksi': 16, 'hasil': 8, 'tertarik': 27, 'teknolog': 26, 'memiliki': 15, 'aplikasi': 2, 'menarik': 17, 'mengikuti': 18, 'kursu': 11, 'onlin': 19}

Bag of Words:
 [[0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0]
 [0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0

In [18]:
# TF-IDF: same corpus as the bag-of-words cell, but with TF-IDF weights
# instead of raw term counts.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(preprocessed_sentences)

# Show the sorted feature names, the term -> column-index mapping, and the
# dense weight matrix.
tfidf_report = (
    ("Vocabulary:\n", tfidf_vectorizer.get_feature_names_out()),
    ('\nVector vocabulary:\n', tfidf_vectorizer.vocabulary_),
    ("\nTF-IDF:\n", X_tfidf.toarray()),
)
for heading, value in tfidf_report:
    print(heading, value)

Vocabulary:
 ['algoritma' 'analisi' 'aplikasi' 'bahasa' 'belajar' 'buatan' 'cabang'
 'data' 'hasil' 'kecerdasan' 'keputusan' 'kursu' 'learn' 'machin'
 'membantu' 'memiliki' 'memprediksi' 'menarik' 'mengikuti' 'onlin'
 'pemrograman' 'pengambilan' 'popul' 'python' 'scienc' 'suka' 'teknolog'
 'tertarik']

Vector vocabulary:
 {'suka': 25, 'belajar': 4, 'data': 7, 'scienc': 24, 'python': 23, 'bahasa': 3, 'pemrograman': 20, 'popul': 22, 'analisi': 1, 'membantu': 14, 'pengambilan': 21, 'keputusan': 10, 'machin': 13, 'learn': 12, 'cabang': 6, 'kecerdasan': 9, 'buatan': 5, 'algoritma': 0, 'memprediksi': 16, 'hasil': 8, 'tertarik': 27, 'teknolog': 26, 'memiliki': 15, 'aplikasi': 2, 'menarik': 17, 'mengikuti': 18, 'kursu': 11, 'onlin': 19}

TF-IDF:
 [[0.         0.         0.         0.         0.5007009  0.
  0.         0.3894615  0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.5007009

In [19]:
# Convert the TF-IDF matrix to a DataFrame: one row per sentence, one column
# per vocabulary term. pandas is already imported in the notebook's first
# cell, so it is not re-imported here.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_feature_names)
tfidf_df


Unnamed: 0,algoritma,analisi,aplikasi,bahasa,belajar,buatan,cabang,data,hasil,kecerdasan,...,mengikuti,onlin,pemrograman,pengambilan,popul,python,scienc,suka,teknolog,tertarik
0,0.0,0.0,0.0,0.0,0.500701,0.0,0.0,0.389462,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.500701,0.588997,0.0,0.0
1,0.0,0.0,0.0,0.518291,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.518291,0.0,0.518291,0.440595,0.0,0.0,0.0,0.0
2,0.0,0.619575,0.0,0.0,0.0,0.0,0.0,0.481926,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.619575,0.0,0.0,0.0,0.0
3,0.0,0.416798,0.0,0.0,0.0,0.0,0.0,0.324199,0.0,0.0,...,0.0,0.0,0.0,0.490297,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.451081,0.530627,0.0,0.0,0.451081,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.493488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.493488,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.707107
7,0.0,0.0,0.538748,0.0,0.0,0.457985,0.0,0.0,0.0,0.457985,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.500701,0.0,0.0,0.389462,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.500701,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.493488,0.493488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Print the dense TF-IDF matrix for the whole corpus, then only the row that
# belongs to the first document.
first_doc_row = X_tfidf[0]
print("\nTF-IDF:\n", X_tfidf.toarray())
print("\nTF-IDF Doc0:\n", first_doc_row.toarray())


In [None]:
# Rank the corpus against a free-text query by cosine similarity in TF-IDF space.
query = "data"

# Run the query through the same preprocessing as the corpus so that it is
# matched against the vectorizer's stemmed vocabulary rather than raw text.
query_vector = tfidf_vectorizer.transform([preprocess_text(query)])

# cosine_similarity returns one row of scores per query; there is one query here.
cosine_similarities = cosine_similarity(query_vector, X_tfidf)
similarity_scores = cosine_similarities[0]

# argsort is ascending, so reverse it to put the best match first.
sorted_indices = np.argsort(similarity_scores)[::-1]
top_5_indices = sorted_indices[:5]

# The header is built from `query` so it stays correct if the query changes.
print(f"Top 5 sentences similar to '{query}':")
for index in top_5_indices:
    print(f"Document {index + 1}: {sentences[index]} (Similarity score: {similarity_scores[index]})")

In [None]:
# Score every document against the query vector built in the previous cell
# and collapse the result to a 1-D array of scores.
cosine_similarities = cosine_similarity(query_vector, X_tfidf).ravel()

# Sort the documents by cosine similarity score in descending order.
related_docs_indices = cosine_similarities.argsort()[::-1]

# Display each document together with its score.
print("Sentences similar to 'data':\n")
for i in related_docs_indices:
    print(
        f"Document {i + 1}: {sentences[i]}",
        f"Similarity Skor: {cosine_similarities[i]:.4f}",
        "-" * 20,
        sep="\n",
    )