In [1]:
import streamlit as st
import pandas as pd
import numpy as np
import gspread
import plotly.express as px
import plotly.graph_objects as go

from datetime import timedelta
from google.oauth2.service_account import Credentials
from collections import defaultdict

In [5]:
# Gather data: read every configured worksheet from Google Sheets and stack
# them into a single raw DataFrame (`df_all`).
SERVICE_ACCOUNT_FILE = '../../key/credentials.json'
SCOPES = ['https://www.googleapis.com/auth/spreadsheets',
          'https://www.googleapis.com/auth/drive']
creds = Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE, scopes=SCOPES)
client = gspread.authorize(creds)

# Spreadsheet title -> worksheet names to read from that spreadsheet.
# NOTE(review): the second title ends with a trailing space; presumably it
# matches the real spreadsheet title exactly — confirm before "fixing" it.
file_sheet_map = {
    'AI QC Inbound CRM Review 语音智能质检打标复审': ['New'],
    '[2] AI QC Inbound CRM Review 语音智能质检打标复审 ': ['NEW 2', 'New 3']
}

df_list = []

for file, sheets in file_sheet_map.items():
    # Open each spreadsheet once instead of once per worksheet; the lookup by
    # title is a remote call and does not depend on the worksheet.
    try:
        spreadsheet = client.open(file)
    except Exception as e:
        # Best-effort loading: report the failure for every worksheet that
        # could not be read and move on to the next spreadsheet.
        for sheet_name in sheets:
            print(f"Sumting wong {file} | Sheet: {sheet_name} - Error: {e}")
        continue

    for sheet_name in sheets:
        try:
            sheet = spreadsheet.worksheet(sheet_name.strip())
            data = sheet.get_all_values()
            print(f"Readed {file} | {sheet_name} - {len(data)} baris")

            # Without a header row there is nothing to build a frame from;
            # fail with a readable message instead of an IndexError.
            if not data or not data[0]:
                raise ValueError("worksheet is empty (no header row)")

            # First row is the header, everything after it is data.
            header = data[0]
            rows = data[1:]

            df = pd.DataFrame(rows, columns=header)
            # Record provenance so rows can be traced back to their source.
            df['Sheet Name'] = sheet_name.strip()
            df['File Name'] = file
            df_list.append(df)

        except Exception as e:
            # Best-effort loading: one bad worksheet must not abort the rest.
            print(f"Sumting wong {file} | Sheet: {sheet_name} - Error: {e}")

# pd.concat([]) raises an opaque "No objects to concatenate"; since the
# individual errors were only printed, point the reader back at them.
if not df_list:
    raise ValueError(
        "No worksheet could be read; see the errors printed above.")

df_all = pd.concat(df_list, ignore_index=True)


Readed AI QC Inbound CRM Review 语音智能质检打标复审 | New - 131603 baris
Readed [2] AI QC Inbound CRM Review 语音智能质检打标复审  | NEW 2 - 106501 baris
Readed [2] AI QC Inbound CRM Review 语音智能质检打标复审  | New 3 - 97203 baris


In [15]:
# Inspect the combined raw frame: column names, non-null counts and dtypes.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 335304 entries, 0 to 335303
Data columns (total 27 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   No                          335304 non-null  object
 1   Tanggal Pengerjaan          335304 non-null  object
 2   Checker                     335304 non-null  object
 3   ASI/AFI                     335304 non-null  object
 4   Waktu Inbound               335304 non-null  object
 5   Nomor Inbound               335304 non-null  object
 6   Nama Agent                  335304 non-null  object
 7   Call ID                     335304 non-null  object
 8   Durasi Bicara               335304 non-null  object
 9   Detik                       335304 non-null  object
 10  Total Tiket Riskan          335304 non-null  object
 11  Apakah Riskan?              335304 non-null  object
 12  Tinjauan CS                 335304 non-null  object
 13  Status                      3

In [16]:
# Data Cleaning
# Work on a copy so the raw frame `df_all` stays untouched and this cell can
# be re-run from the same input.
df_clean = df_all.copy()

# Strip whitespace from the column names; a header that is blank after
# stripping is renamed to 'Catatan Tambahan'.
# NOTE(review): if more than one header is blank this yields duplicate
# 'Catatan Tambahan' columns — confirm the sheets have at most one.
df_clean.columns = [col.strip() or 'Catatan Tambahan' for col in df_clean.columns]

# The 'Status' column is not kept; drop it up front (no error if absent) so
# no cleaning work is spent on it.
df_clean = df_clean.drop(columns=['Status'], errors='ignore')

# Convert the date/time columns; values that cannot be parsed become NaT.
# NOTE(review): no explicit format/dayfirst is given, so ambiguous dates such
# as 01/02/2024 are parsed with pandas' default rules — confirm against the sheets.
datetime_cols = ['Tanggal Pengerjaan', 'Waktu Inbound', 'Tanggal Sampling']
for col in datetime_cols:
    if col in df_clean.columns:
        df_clean[col] = pd.to_datetime(df_clean[col], errors='coerce')

# Convert the row-number column to numeric; unparseable values become NaN.
if 'No' in df_clean.columns:
    df_clean['No'] = pd.to_numeric(df_clean['No'], errors='coerce')

# Clean the remaining text (object) columns: trim whitespace and label empty
# cells as 'No Data'. Missing values are mapped to '' first, because
# astype(str) would otherwise turn them into the literal strings 'nan'/'None',
# which the empty-string replacement would not catch.
text_columns = df_clean.select_dtypes(include='object').columns
df_clean[text_columns] = df_clean[text_columns].apply(
    lambda s: s.where(s.notna(), '').astype(str).str.strip().replace('', 'No Data')
)

# Convert low-cardinality label columns to the category dtype when present.
categorical_cols = [
    'Checker', 'ASI/AFI',
    'Efektif', 'Kejelasan Suara', 'Suara Lain',
    'Kelengkapan Rekaman', 'Sampling user side',
    'Hasil ASR', 'Hasil Pemeriksaan Kualitas', 'Agent Sampling'
]
for col in categorical_cols:
    if col in df_clean.columns:
        df_clean[col] = df_clean[col].astype('category')

# Drop duplicate rows based on the 'Detik' column (matched case-insensitively),
# keeping the first occurrence.
# NOTE(review): blank 'Detik' cells are 'No Data' by this point, so all rows
# with a blank 'Detik' collapse into a single row — confirm that is intended.
detik_cols = [c for c in df_clean.columns if c.lower() == 'detik']
if detik_cols:
    df_clean = (
        df_clean
        .drop_duplicates(subset=detik_cols, keep='first')
        .reset_index(drop=True)
    )

# Show the resulting schema.
df_clean.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293795 entries, 0 to 293794
Data columns (total 26 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   No                          293794 non-null  float64       
 1   Tanggal Pengerjaan          293639 non-null  datetime64[ns]
 2   Checker                     293795 non-null  category      
 3   ASI/AFI                     293795 non-null  category      
 4   Waktu Inbound               293748 non-null  datetime64[ns]
 5   Nomor Inbound               293795 non-null  object        
 6   Nama Agent                  293795 non-null  object        
 7   Call ID                     293795 non-null  object        
 8   Durasi Bicara               293795 non-null  object        
 9   Detik                       293795 non-null  object        
 10  Total Tiket Riskan          293795 non-null  object        
 11  Apakah Riskan?              293795 non-

In [18]:
# Count each distinct raw 'ASI/AFI' label in the uncleaned frame.
# NOTE(review): the output lists 'ASI' twice plus a blank label; presumably
# the second 'ASI' differs only by whitespace — confirm.
df_all['ASI/AFI'].value_counts()

ASI/AFI
ASI     178815
AFI     151925
ASI       4143
           421
Name: count, dtype: int64

In [19]:
# Export the combined raw frame to CSV.
# NOTE(review): this writes `df_all`, not the cleaned `df_clean`, and with
# to_csv's default arguments the row index is also written as the first
# column — confirm both are intended.
df_all.to_csv('../../dataset_qc/all_qc.csv')