In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from openai import AzureOpenAI
import tiktoken

class Llm:
    @staticmethod
    def summarize_sentiments(csv_file_path_positif, csv_file_path_negatif):
        # Membaca data dari CSV
        df_positif = pd.read_csv(csv_file_path_positif)
        df_negatif = pd.read_csv(csv_file_path_negatif)
          # Menghapus baris dengan nilai NaN di kolom 'processed'
        df_positif = df_positif.dropna(subset=['processed'])
        df_negatif = df_negatif.dropna(subset=['processed'])
        
        # Memisahkan teks berdasarkan sentimen
        positive_texts = df_positif['processed'].tolist()
        negative_texts = df_negatif['processed'].tolist()

        # TF-IDF vectorization
        vectorizer = TfidfVectorizer()
        positive_tfidf = vectorizer.fit_transform(positive_texts)
        negative_tfidf = vectorizer.fit_transform(negative_texts)
        print(positive_tfidf.shape)
        
        # Mengambil fitur (kata-kata) dan skor tf-idf tertinggi
        def extract_top_keywords(tfidf_matrix, feature_names, top_n=100):
            top_keywords = {}
            for row in tfidf_matrix:
                indices = row.indices
                scores = row.data
                for index, score in zip(indices, scores):
                    if feature_names[index] in top_keywords:
                        top_keywords[feature_names[index]] = max(top_keywords[feature_names[index]], score)
                    else:
                        top_keywords[feature_names[index]] = score
            
            sorted_keywords = sorted(top_keywords.items(), key=lambda x: x[1], reverse=True)[:top_n]
            return " ".join([word for word, score in sorted_keywords])
        
        positive_top_keywords = extract_top_keywords(positive_tfidf, vectorizer.get_feature_names_out())
        negative_top_keywords = extract_top_keywords(negative_tfidf, vectorizer.get_feature_names_out())

        print(positive_top_keywords)
        print(len(positive_top_keywords))
        print(negative_top_keywords)
        print(len(negative_top_keywords))

        # Define max tokens per request and tokenizer
        max_tokens = 2000  # Adjust to a reasonable size to balance chunking and API call overhead
        tokenizer = tiktoken.get_encoding("cl100k_base")

        # Function to split text into chunks based on token count
        def split_text_into_chunks(text, max_tokens, tokenizer):
            words = text.split()
            chunks = []
            current_chunk = []

            for word in words:
                current_chunk.append(word)
                if len(tokenizer.encode(" ".join(current_chunk))) > max_tokens:
                    current_chunk.pop()
                    chunks.append(" ".join(current_chunk))
                    current_chunk = [word]

            if current_chunk:
                chunks.append(" ".join(current_chunk))

            return chunks

        # Create prompts and process each chunk
        def create_prompt(sentiment, summary_text, keyword):
            role = "AI Linguistik"
            action = "Tambahkan konteks tambahan yang menjelaskan mengapa pandangan tertentu dipegang oleh sebagian orang berdasarkan sentimen positif atau negatif terhadap suatu topik"
            step = "menganalisis teks hasil preprocessing untuk menemukan kata-kata kunci yang paling relevan dan menyusun kalimat yang memberikan konteks tambahan"
            context = f"menambahkan konteks tambahan mengapa orang-orang memandang {sentiment} terhadap topik {keyword}"
            example = f"Misalnya, jika teks memiliki sentimen {sentiment}, Anda akan menambahkan konteks tambahan dengan mengambil 3 poin utama mengapa orang-orang memandang {sentiment} terhadap topik ini. Jelaskan alasan-alasan ini dengan menghubungkannya ke kata-kata kunci yang paling relevan."
            format_str = "dalam format poin-poin bullet"

            # Combine components into the RASCEF prompt
            prompt = (f"# RASCEF = Role + ( Action + Step + Context + Example ) + Format\n\n"
                    f"Anda adalah {role} yang {action} {step} {context}. {example} Buatkan {format_str} dari teks berikut:\n\n"
                    f"{summary_text}")
            return prompt

        def summarize_chunks(sentiment, chunks, keyword):
            summaries = []
            for chunk in chunks:
                prompt = create_prompt(sentiment, chunk, keyword)
                response = client.chat.completions.create(
                    model="aicdeploymodel",  # Replace with your deployment name
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": prompt}
                    ],
                    max_tokens=100,
                    temperature=0.3
                )
                summary = response.choices[0].message.content.strip()
                summaries.append(summary)
            return " ".join(summaries)

        # Initialize Azure OpenAI client
        client = AzureOpenAI(
            api_version="2023-05-15",
            azure_endpoint="https://chatbot-aic.openai.azure.com/",
            api_key="fd068a3036e34fe188a28392699ecc65",
        )

        # Generate summary for positive sentiment
        positive_chunks = split_text_into_chunks(positive_top_keywords, max_tokens, tokenizer)
        positive_summary = summarize_chunks('positif', positive_chunks, "impor susu")

        # Generate summary for negative sentiment
        negative_chunks = split_text_into_chunks(negative_top_keywords, max_tokens, tokenizer)
        negative_summary = summarize_chunks('negatif', negative_chunks, "impor susu")

        # Return summaries for both sentiments
        return {
            'positive_summary': positive_summary,
            'negative_summary': negative_summary
        }


In [14]:
# Example usage
summaries = Llm.summarize_sentiments('./hasil-klasifikasi/impor_susu_sapi_jan_mei_pos.csv', './hasil-klasifikasi/impor_susu_sapi_jan_mei_neg.csv')
print("Positive Summary:\n", summaries['positive_summary'])
print("Negative Summary:\n", summaries['negative_summary'])



(99, 679)
jelek ideal after debat alasan ailan gede komponen dagingnya korban burung kuning idul dana kuota cegah bumn aman hukum ketahanan hasil konstitusi basah jakarta cuannya dpet karantina boyolali barang kerjanya konsepnya lelang dibahas dalem bisnis durung handle let bea celeng gandum diledek kecil jelas bilang diprotes jutaan hamil ikam ajak berdampak ayo bem diam jamin laman cik gjlas beranak buruh india ambisius lactose hutang berpmk australia elite intoleran dukungan listrik jenis ahli didengarkan kelen badan kelilmuannya kuliah hasto jagung acak goblog lambat babi gegara blunder ketua bapakkau hubungan cocok cengengas food gratisblablabla kandungan kanada harga demand dibantu ditenderkan gyuldangies folat
716
hayo ranum disuruh perah bilang benar bukan marah su beras pangan mencre anak tahun kasihan hmm jpg mendukung juta jack shi anjing penyediaan omon kata selandia bahaya pang jangan bisnis dibully orang kecil kompor kerbau asia india import rakyat impornya permudah mengu