In [48]:
import os
import string
import pandas as pd
import numpy as np
import re
from collections import Counter
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# Locations of the input dataset and of the directory that receives result CSVs.
# NOTE(review): ``__name__`` is a module name rather than a file path, so
# ``os.path.dirname(__name__)`` presumably yields an empty string and both paths
# end up resolved one level above the current working directory -- confirm that
# this is the intended anchor.
_parent_dir = os.path.join(os.path.dirname(__name__), os.pardir)
dataset_path = os.path.realpath(
    os.path.join(_parent_dir, 'app', 'static', 'uploads', 'dataset.csv')
)
result_path = os.path.realpath(
    os.path.join(_parent_dir, 'notebook', 'result')
)

class TFIDFProcessor:
    """Text preprocessing plus hand-rolled TF, IDF and TF-IDF computation.

    Documents handled by the ``compute_*`` and ``process_corpus`` methods are
    plain strings of whitespace-separated terms, i.e. the output of
    :meth:`preprocess_text`.
    """

    # Matches every character that is neither an ASCII letter nor whitespace
    # (punctuation, digits, symbols, non-ASCII characters).
    _NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')
    # Matches any run of whitespace so it can be collapsed to one space.
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self):
        # Initialize Sastrawi tools
        self.stemmer = StemmerFactory().create_stemmer()
        stopword_factory = StopWordRemoverFactory()
        # Stop words to drop: the Sastrawi list merged with NLTK's English list.
        self.combined_stopwords = set(stopword_factory.get_stop_words()).union(
            stopwords.words('english')
        )
        self.terms = None  # Sorted vocabulary, filled in by process_corpus()

    def clean_text(self, text):
        """Return *text* reduced to ASCII letters separated by single spaces.

        Punctuation, digits and any other non-letter characters are removed
        (not replaced by a space), then whitespace runs are collapsed and the
        result is stripped. ``None`` and NaN are treated as an empty text and
        other non-string values are converted with ``str()``.
        """
        # pandas represents missing CSV cells as float NaN (NaN != NaN).
        if text is None or (isinstance(text, float) and text != text):
            return ''
        if not isinstance(text, str):
            text = str(text)
        text = self._NON_LETTER_RE.sub('', text)
        # Normalize whitespace last so removing characters cannot leave
        # doubled or leading/trailing spaces behind.
        return self._WHITESPACE_RE.sub(' ', text).strip()

    def preprocess_text(self, text):
        """Clean, lowercase, tokenize, remove stop words and stem *text*.

        Returns the stemmed tokens joined by single spaces.
        """
        text = self.clean_text(text).lower()
        tokens = word_tokenize(text)
        tokens = [word for word in tokens if word not in self.combined_stopwords]
        stemmed = [self.stemmer.stem(word) for word in tokens]
        return ' '.join(stemmed)

    def compute_raw_tf(self, doc):
        """Return a ``Counter`` mapping each term of *doc* to its raw count."""
        return Counter(doc.split())

    def compute_tf(self, doc):
        """Return ``{term: count / number_of_terms_in_doc}`` for *doc*.

        An empty document yields an empty dict.
        """
        counts = self.compute_raw_tf(doc)
        total_terms = sum(counts.values())
        return {term: count / total_terms for term, count in counts.items()}

    def compute_idf(self, corpus):
        """Return the smoothed IDF of every term occurring in *corpus*.

        ``idf(t) = ln((1 + N) / (1 + df(t))) + 1`` where ``N`` is the number of
        documents and ``df(t)`` the number of documents containing ``t``.
        """
        n_docs = len(corpus)
        # One pass over the corpus: each document contributes at most 1 per term.
        doc_freq = Counter()
        for doc in corpus:
            doc_freq.update(set(doc.split()))

        return {
            term: float(np.log((1 + n_docs) / (1 + df)) + 1)
            for term, df in doc_freq.items()
        }

    def compute_tfidf(self, tf_dict, idf_dict):
        """Multiply each TF value by the matching IDF (0.0 if the term is unknown)."""
        return {
            word: float(tf_value) * float(idf_dict.get(word, 0.0))
            for word, tf_value in tf_dict.items()
        }

    def process_corpus(self, corpus):
        """Compute raw TF, normalized TF, TF-IDF and IDF for *corpus*.

        Returns ``(terms, raw_tf_df, tf_df, tfidf_df, idf_values)``:

        * ``terms`` -- alphabetically sorted vocabulary (also stored on
          ``self.terms``);
        * the three DataFrames have one row per term, in the same order as
          ``terms``, and one column per document labelled ``D1``..``Dn``;
          terms absent from a document hold 0;
        * ``idf_values`` -- dict mapping each term to its IDF.
        """
        idf_values = self.compute_idf(corpus)

        # Get the terms (features)
        self.terms = sorted(idf_values)
        doc_labels = [f'D{i}' for i in range(1, len(corpus) + 1)]

        raw_tf_dicts = [self.compute_raw_tf(doc) for doc in corpus]
        tf_dicts = [self.compute_tf(doc) for doc in corpus]
        # Every TF-IDF vector spans the full vocabulary so all documents share
        # the same feature layout.
        tfidf_dicts = [
            {term: tf_dict.get(term, 0) * idf_values[term] for term in self.terms}
            for tf_dict in tf_dicts
        ]

        def to_term_frame(dicts):
            # Rows are reindexed to the sorted vocabulary so the three frames
            # line up row-for-row; missing entries become 0.
            frame = pd.DataFrame(dicts, index=doc_labels).T
            return frame.reindex(self.terms).fillna(0)

        raw_tf_df = to_term_frame(raw_tf_dicts)
        tf_df = to_term_frame(tf_dicts)
        tfidf_df = to_term_frame(tfidf_dicts)

        return self.terms, raw_tf_df, tf_df, tfidf_df, idf_values


class SVMClassifier:
    """Linear SVM trained with per-sample sub-gradient descent on the hinge loss.

    The decision function is ``f(x) = dot(x, weights) - bias``. Training and
    prediction use the label convention ``+1`` / ``-1``.
    """

    def __init__(self):
        self.weights = None  # 1-D array of feature weights, set by train_svm()
        self.bias = None     # scalar offset, set by train_svm()

    def train_svm(self, X_train, y_train, lr=0.0001, epochs=1000, C=1.0):
        """Fit ``weights`` and ``bias`` on the training data.

        Args:
            X_train: 2-D array-like of shape (samples, features).
            y_train: 1-D array-like of labels, expected to be ``+1`` / ``-1``.
            lr: Learning rate applied to every update.
            epochs: Number of full passes over the training data.
            C: Strength of the L2 weight penalty. Note that here ``C``
                multiplies the regularization gradient (``2 * C * weights``),
                so a larger value shrinks the weights harder.

        Raises:
            ValueError: If ``X_train`` is not 2-D or the number of labels does
                not match the number of samples.
        """
        # Accept lists / DataFrames as well as ndarrays.
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train, dtype=float).ravel()
        if X_train.ndim != 2:
            raise ValueError('X_train must be 2-dimensional (samples, features)')
        if y_train.shape[0] != X_train.shape[0]:
            raise ValueError('X_train and y_train must contain the same number of samples')

        weights = np.zeros(X_train.shape[1])
        bias = 0.0

        # Gradient descent for SVM: samples are visited in order, one update each.
        for _ in range(epochs):
            for x_i, y_i in zip(X_train, y_train):
                if y_i * (np.dot(x_i, weights) - bias) >= 1:
                    # Outside the margin: only the regularization term acts.
                    weights -= lr * (2 * C * weights)
                else:
                    # Inside the margin or misclassified: add the hinge-loss gradient.
                    weights -= lr * (2 * C * weights - x_i * y_i)
                    bias -= lr * y_i

        self.weights, self.bias = weights, bias

    def _decision_function(self, X):
        """Return ``dot(X, weights) - bias``; shared by predict and compute_decision_values."""
        if self.weights is None or self.bias is None:
            raise RuntimeError('SVMClassifier must be trained with train_svm() before use')
        return np.dot(np.asarray(X, dtype=float), self.weights) - self.bias

    def predict(self, X):
        """Return the predicted label (``1.0`` or ``-1.0``) for each row of *X*.

        A decision value of exactly 0 is assigned to the positive class so that
        every sample always receives one of the two labels.
        """
        return np.where(self._decision_function(X) >= 0, 1.0, -1.0)

    def compute_decision_values(self, X):
        """Return the raw decision value ``dot(x, weights) - bias`` for each row of *X*."""
        return self._decision_function(X)

    def get_weights_bias(self):
        """Return the ``(weights, bias)`` pair learned by train_svm()."""
        return self.weights, self.bias


class TextClassifier:
    """Pipeline that loads comments, builds TF-IDF features, trains and evaluates the SVM.

    The dataset CSV is read with pandas and must provide a ``comment`` column
    (raw text) and a ``label`` column. Evaluation counts labels equal to ``1``
    as positive and ``-1`` as negative.
    """

    def __init__(self, dataset_path, result_path):
        """Load the dataset and preprocess every comment.

        Args:
            dataset_path: Path of the CSV file to read.
            result_path: Directory that receives the exported CSV files.
        """
        self.dataset_path = dataset_path
        self.result_path = result_path

        # Initialize processors
        self.tfidf_processor = TFIDFProcessor()
        self.svm_classifier = SVMClassifier()

        # Load dataset
        self.df_comments = pd.read_csv(self.dataset_path)

        # Preprocess text. Empty cells are read as NaN (a float), which the text
        # cleaning cannot handle, so they are replaced by empty strings first.
        comments = self.df_comments['comment'].fillna('').astype(str)
        self.df_comments['preprocess'] = comments.apply(self.tfidf_processor.preprocess_text)

    def _result_file(self, filename):
        """Return the path of *filename* inside the result directory."""
        return os.path.join(self.result_path, filename)

    def _export_term_metrics(self, raw_tf_df, tf_df, tfidf_df, idf_values):
        """Write the per-term table (TF, normalized TF, TF-IDF, DF, IDF) to train_metrics.csv."""
        final_df = pd.DataFrame(index=self.terms)
        final_df['Terms'] = self.terms
        # Joins align on the term index, so the row order of the inputs does not matter.
        final_df = final_df.join(raw_tf_df.add_prefix('TF'))      # raw counts per document
        final_df = final_df.join(tf_df.add_prefix('TFN'))         # normalized TF per document
        final_df = final_df.join(tfidf_df.add_prefix('TFIDF'))    # TF-IDF per document

        # Document frequency: number of documents in which the term occurs.
        final_df['DF'] = (raw_tf_df > 0).sum(axis=1)
        final_df['IDF'] = [idf_values.get(term, 0) for term in self.terms]

        # Round all numeric columns to 3 decimal places
        final_df = final_df.round(3)
        final_df.to_csv(self._result_file('train_metrics.csv'), index=False)

    def train_model(self):
        """Build features, train the SVM on a 70/30 split and export the results.

        Files written to ``result_path`` (the directory is created if needed):
        ``train_metrics.csv``, ``metrics.csv``, ``decision_values.csv`` and
        ``weights_bias.csv``.
        """
        corpus = self.df_comments['preprocess'].tolist()

        # Process corpus for TF-IDF
        self.terms, raw_tf_df, tf_df, tfidf_df, idf_values = self.tfidf_processor.process_corpus(corpus)

        os.makedirs(self.result_path, exist_ok=True)
        self._export_term_metrics(raw_tf_df, tf_df, tfidf_df, idf_values)

        # tfidf_df is terms x documents; the SVM expects documents x terms.
        X = tfidf_df.T.to_numpy(dtype=float)
        y = self.df_comments['label'].values

        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

        # Train SVM manually
        self.svm_classifier.train_svm(X_train, y_train)

        # Predict on test set
        y_pred = self.svm_classifier.predict(X_test)

        # Calculate metrics
        metrics = self.calculate_metrics(y_test, y_pred)

        # Export metrics to CSV. The confusion matrix is flattened into its own
        # columns: a nested dict would be aligned against the row index by
        # pandas and end up as an empty (NaN) cell.
        flat_metrics = {key: value for key, value in metrics.items() if key != 'ConfusionMatrix'}
        flat_metrics.update(metrics['ConfusionMatrix'])
        pd.DataFrame([flat_metrics]).to_csv(self._result_file('metrics.csv'), index=False)

        # Export decision values to CSV. The D1..Dn labels number the rows of
        # the test split, not the documents of the original dataset.
        decision_values = self.svm_classifier.compute_decision_values(X_test)
        decision_values_df = pd.DataFrame(
            decision_values,
            index=[f'D{i}' for i in range(1, len(X_test) + 1)],
            columns=['DecisionValue'],
        )
        decision_values_df.to_csv(self._result_file('decision_values.csv'), index=True)

        # Export weights and bias to CSV (the scalar bias is repeated on every row)
        weights, bias = self.svm_classifier.get_weights_bias()
        weights_bias_df = pd.DataFrame(
            {'Weights': weights, 'Bias': [bias] * len(weights)},
            index=[f'Feature{i}' for i in range(1, len(weights) + 1)],
        )
        weights_bias_df.to_csv(self._result_file('weights_bias.csv'), index=True)

        print(f'sentiment test : {y_test}')
        print(f'sentiment pred : {y_pred}')

    def calculate_metrics(self, y_true, y_pred):
        """Compute and print accuracy, precision, recall, F1 and the confusion matrix.

        Args:
            y_true: Array-like of true labels (``1`` positive, ``-1`` negative).
            y_pred: Array-like of predicted labels, same length as ``y_true``.

        Returns:
            Dict with keys ``Accuracy``, ``Precision``, ``Recall``, ``F1-Score``
            and ``ConfusionMatrix`` (itself a dict with ``TP``, ``TN``, ``FP``,
            ``FN``). Ratios whose denominator is zero are reported as 0.
        """
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)

        # Calculate confusion matrix (plain ints keep the output easy to serialize)
        tp = int(np.sum((y_true == 1) & (y_pred == 1)))
        tn = int(np.sum((y_true == -1) & (y_pred == -1)))
        fp = int(np.sum((y_true == -1) & (y_pred == 1)))
        fn = int(np.sum((y_true == 1) & (y_pred == -1)))

        # Accuracy is measured against every sample, so a prediction that is
        # neither 1 nor -1 counts as an error instead of being ignored.
        total_samples = int(y_true.size)
        accuracy = (tp + tn) / total_samples if total_samples > 0 else 0

        # Avoid division by zero
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        print(f'Accuracy: {accuracy}')
        print(f'Precision: {precision}')
        print(f'Recall: {recall}')
        print(f'F1-Score: {f1_score}')
        print(f'ConfusionMatrix: TP={tp}, TN={tn}, FP={fp}, FN={fn}')

        return {
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1-Score': f1_score,
            'ConfusionMatrix': {'TP': tp, 'TN': tn, 'FP': fp, 'FN': fn}
        }


# Build the classifier from the module-level dataset and result locations;
# constructing it loads the CSV and preprocesses the comments.
text_classifier = TextClassifier(dataset_path=dataset_path, result_path=result_path)

# Run training and evaluation, writing the result CSVs.
text_classifier.train_model()


Accuracy =  0.48
Precision =  0.48
Recall =  1.0
F1-Score =  0.6486486486486487
ConfusionMatrix =  {'TP': np.int64(144), 'TN': np.int64(0), 'FP': np.int64(156), 'FN': np.int64(0)}
tp : 144
fn : 0
sentiment test : [-1 -1  1 -1 -1 -1  1  1  1 -1  1 -1 -1  1 -1  1 -1  1  1  1  1  1  1  1
 -1 -1  1  1 -1  1 -1 -1 -1 -1  1 -1 -1  1 -1  1  1 -1 -1 -1  1 -1 -1  1
  1 -1  1  1  1  1 -1 -1  1  1 -1  1 -1 -1  1  1  1  1  1 -1  1 -1 -1  1
 -1  1 -1  1  1 -1 -1 -1  1  1  1 -1 -1 -1  1 -1  1 -1  1  1 -1  1 -1  1
 -1  1  1 -1 -1 -1  1 -1  1 -1 -1 -1 -1  1  1 -1  1 -1  1  1 -1  1  1 -1
 -1 -1  1 -1  1 -1  1  1 -1 -1  1 -1 -1 -1 -1 -1  1  1 -1 -1  1 -1 -1  1
  1  1  1  1  1  1  1  1  1 -1 -1 -1  1  1 -1  1 -1 -1 -1 -1 -1  1  1 -1
  1  1 -1 -1  1 -1  1 -1  1 -1 -1  1 -1 -1  1  1 -1 -1 -1  1 -1  1 -1 -1
  1  1 -1 -1 -1  1 -1  1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1  1 -1  1 -1 -1 -1
  1  1  1 -1 -1 -1 -1  1  1  1 -1 -1  1  1 -1  1 -1 -1  1  1 -1  1 -1  1
 -1  1  1  1 -1  1  1  1  1 -1  1 -1  1 -1  1 -1 -1  1 -1