<a href="https://colab.research.google.com/github/Rizukaf-id/DRPM-machine-learning/blob/main/drpm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Libraries

In [None]:
# Mount Google Drive when running on Colab. On other platforms (for example
# Kaggle, whose dataset path is used below) the google.colab package may not
# be installed, so skip mounting instead of aborting the notebook.
try:
    from google.colab import drive
except ImportError:
    drive = None  # keeps the name defined for any later reference
    print("google.colab is not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

In [1]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

## Load Data

In [None]:
# Locate the dataset without editing the cell per platform.
from pathlib import Path

# Candidate locations in order of preference: the Kaggle input directory
# first (the original default), then the Google Drive copy used on Colab.
candidate_files = [
    "/kaggle/input/dataset-drpm/after_preprocessed_data_with_emoji - after_preprocessed_data_with_emoji.csv",
    "/content/drive/MyDrive/DRPM 2024/code/data/after_preprocessed_data_with_emoji.csv",
]

# Use the first path that exists. Fall back to the Kaggle path so a missing
# dataset still fails in read_csv with the same path as before.
file = next((p for p in candidate_files if Path(p).is_file()), candidate_files[0])
df = pd.read_csv(file)
df

In [None]:
# Flag rows whose "text" value equals their "new text" value, then count how
# many rows are identical (True) versus different (False).
# The comparison is computed once; the earlier duplicate whose result was
# discarded has been removed.
similar_text = df["text"] == df["new text"]
count_similar_text = similar_text.value_counts()
count_similar_text

## Preprocess

In [None]:
# Recode 'emoji_berulang' as an integer flag: values equal to False become 0,
# every other value becomes 1.
df['emoji_berulang'] = (df['emoji_berulang'] != False).astype('int64')  # noqa: E712

# df_emoji_berulang is a second name for the same DataFrame object, not a copy.
df_emoji_berulang = df
df_emoji_berulang

In [None]:
def has_repeated_words(text):
    """Return 1 if any word in ``text`` is immediately followed by itself, else 0.

    Words are the whitespace-separated tokens produced by ``str.split()``;
    the comparison is exact, so it is case-sensitive.
    """
    tokens = text.split()
    # Pair every token with its successor and look for an identical pair.
    adjacent_pairs = zip(tokens, tokens[1:])
    return int(any(left == right for left, right in adjacent_pairs))
def has_repeated_chars(text):
    """Return 1 if ``text`` has the same character three or more times in a row, else 0.

    The pattern uses ``.``, which does not match a newline, so runs of
    newlines are not counted.
    """
    triple_run = re.search(r'(.)\1{2,}', text)
    return int(triple_run is not None)
def has_repeated_symbols(text):
    """Return 1 if ``text`` has one of the listed symbols twice or more in a row, else 0."""
    # The character class enumerates the symbols that count. Letters, digits,
    # whitespace, quotes and parentheses are not part of it.
    doubled_symbol = re.search(r'([!?,.#$%&*+=\-/\\:;<>@[\]^_`{|}~])\1{1,}', text)
    if doubled_symbol is None:
        return 0
    return 1
def has_repeated_combinations(text):
    """Return 1 if a run of two or more ASCII letters is immediately repeated, else 0.

    For example ``"haha"`` and ``"wkwkwk"`` match. The repeat must be an exact
    copy, so ``"Haha"`` does not match.
    """
    return int(bool(re.search(r'([a-zA-Z]{2,})\1+', text)))

In [None]:
# Derive one 0/1 flag column per repetition detector from the 'text' column.
# The dict order fixes the order in which the new columns are appended.
repeat_detectors = {
    'is_repeat_word': has_repeated_words,
    'is_repeat_char': has_repeated_chars,
    'is_repeat_symbol': has_repeated_symbols,
    'is_repeat_combination': has_repeated_combinations,
}
for flag_column, detector in repeat_detectors.items():
    df[flag_column] = df['text'].apply(detector)
df

In [None]:
# Display the (rows, columns) dimensions of df.
df.shape

### Move columns

In [None]:
# Remove the 'anotasi_gpt' column from df and keep it as a Series.
anotasi = df.pop('anotasi_gpt')
# Re-insert it under the name 'anotasi' at column position 11.
# NOTE(review): the position is hard-coded; insert() raises if it exceeds the
# current number of columns. Presumably meant to be the last column — confirm.
df.insert(11, 'anotasi', anotasi)
df

In [None]:
# Remove the 'emoji_berulang' column from df and keep it as a Series.
emoji_berulang = df.pop('emoji_berulang')
# Re-insert it under the name 'is_repeat_emoji' at column position 9.
# NOTE(review): the position is hard-coded and depends on the current column
# layout; presumably it groups this flag with the other is_repeat_* columns
# — confirm.
df.insert(9, 'is_repeat_emoji', emoji_berulang)
df

### Drop unused columns

In [None]:
# Columns to remove. drop() returns a new DataFrame, which is rebound to df.
unused_columns = ['text', 'new text', 'cleaning', 'case_folding', 'stopword_removal']
df = df.drop(columns=unused_columns)
df

In [None]:
# Display the last 100 rows of df.
df.tail(100)

In [None]:
# Print the number of rows that are exact duplicates of an earlier row.
print(df.duplicated().sum()) # TODO: decide whether these duplicates should be dropped

### Split train/test

In [None]:
# Hold out 20% of the rows for testing (fixed seed) and renumber both parts
# from 0 so later positional joins line up.
df_train, df_test = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=42)
)
print(df_train.shape, df_test.shape)
df_train

### Transforming to TF-IDF

In [None]:
# Report how many training rows have no value in the 'stemming' column.
print(df_train['stemming'].isna().sum())

# Discard those rows so the vectorizer only receives actual strings, and
# renumber the index in the same chained step.
df_train = df_train.dropna(subset=['stemming']).reset_index(drop=True)

# Learn the vocabulary and IDF weights from the training text and encode it.
vectorizer = TfidfVectorizer()
text_feature_train = vectorizer.fit_transform(df_train['stemming'])

# Print the dimensions of the resulting TF-IDF matrix.
print(text_feature_train.shape)

In [None]:
# Flag columns appended, in this order, to the right of the TF-IDF columns.
repeat_flag_columns = [
    'is_repeat_word', 'is_repeat_char',
    'is_repeat_symbol', 'is_repeat_emoji', 'is_repeat_combination',
]
# Stack column-wise: densified TF-IDF matrix first, then the flags. Both
# operands are built inline so no extra name keeps the dense matrix alive.
feature_train = np.hstack((
    text_feature_train.toarray(),
    df_train[repeat_flag_columns].to_numpy(),
))
feature_train.shape

### PCA

In [None]:
# Reduce the combined feature matrix to two principal components.
pca = PCA(n_components=2)
# Fit on the training features and overwrite feature_train with the
# 2-column projection; the original wide matrix is no longer referenced.
# NOTE(review): the fitted `pca` would need to be reused with transform()
# (not refit) on the test features — confirm in the later cells.
feature_train = pca.fit_transform(feature_train)
feature_train.shape

In [None]:
# Wrap the two PCA components in a frame with their final column names.
pca_frame = pd.DataFrame(feature_train, columns=['pca_1', 'pca_2'])
# Attach the label column side by side; rows are matched on the index.
df_feature_train = pd.concat([pca_frame, df_train[['anotasi']]], axis=1)
df_feature_train.describe(include='all')

In [None]:
# Candidate models, collected in one list.
# The decision tree is seeded with the same value as train_test_split (42):
# scikit-learn permutes features at each split, so without a fixed
# random_state the fitted tree can differ between runs.
Classifier = [
    DecisionTreeClassifier(random_state=42),
    SVC(),
    GaussianNB(),
]