In [12]:
import streamlit as st
import pandas as pd
import numpy as np
import gspread
import plotly.express as px
import plotly.graph_objects as go

from datetime import timedelta
from google.oauth2.service_account import Credentials
from collections import defaultdict

In [13]:
# Read data from gsheet

SERVICE_ACCOUNT_FILE = '../../key/credentials.json'
SCOPES = ['https://www.googleapis.com/auth/spreadsheets',
          'https://www.googleapis.com/auth/drive']
creds = Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE, scopes=SCOPES)
client = gspread.authorize(creds)

sheet = client.open("[3] AI QC Inbound CRM Review 语音智能质检打标复审 ").worksheet("NEW 4")
data = sheet.get_all_values()

raw_headers = data[2]
rows = data[:3]
header_counter = defaultdict(int)
unique_header = []

for header in raw_headers:
    base = header if header.strip() != '' else 'Unnamed'
    header_counter[base] += 1

    if header_counter[base] == 1:
        unique_header.append(base)
    else:
        unique_header.append(f"{base}_{header_counter}[base]-1")

df = pd.DataFrame(data[1:], columns=unique_header)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146408 entries, 0 to 146407
Data columns (total 23 columns):
 #   Column                                                                                                                                                                                                                                                                                                                                                                                          Non-Null Count   Dtype 
---  ------                                                                                                                                                                                                                                                                                                                                                                                          --------------   ----- 
 0   No                                                        

In [15]:
# Data Cleaning
df_clean = df.copy()

# Hapus 2 baris awal
df_clean = df_clean.iloc[2:].reset_index(drop=True)

# Bersihkan nama kolom dan ganti yang kosong
df_clean.columns = [col.strip() if col.strip() else 'Catatan Tambahan' for col in df_clean.columns]

# Konversi kolom datetime jika ada
for col in ['Tanggal Pengerjaan', 'Waktu Inbound', 'Tanggal Sampling']:
    if col in df_clean.columns:
        df_clean[col] = pd.to_datetime(df_clean[col], errors='coerce')

# Konversi kolom numerik
if 'No' in df_clean.columns:
    df_clean['No'] = pd.to_numeric(df_clean['No'], errors='coerce')

# Bersihkan kolom teks
text_columns = df_clean.select_dtypes(include='object').columns
df_clean[text_columns] = df_clean[text_columns].apply(lambda s: s.astype(str).str.strip())
df_clean.replace('', 'No Data', inplace=True)

# Konversi kolom kategorikal jika ada
categorical_cols = [
    'Checker', 'ASI/AFI', 'Status',
    'Efektif', 'Kejelasan Suara', 'Suara Lain',
    'Kelengkapan Rekaman', 'Sampling user side',
    'Hasil ASR', 'Hasil Pemeriksaan Kualitas', 'Agent Sampling'
]
for col in categorical_cols:
    if col in df_clean.columns:
        df_clean[col] = df_clean[col].astype('category')

# Hapus kolom 'Status' jika ada
df_clean.drop(columns=['Status'], errors='ignore', inplace=True)

# Tampilkan info
df_clean.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146406 entries, 0 to 146405
Data columns (total 22 columns):
 #   Column                                                                                                                                                                                                                                                                                                                                                                                          Non-Null Count   Dtype         
---  ------                                                                                                                                                                                                                                                                                                                                                                                          --------------   -----         
 0   No                                        

In [16]:
df_clean

Unnamed: 0,No,Tanggal Pengerjaan,Checker,ASI/AFI,Waktu Inbound,Nomor Inbound,Nama Agent,Call ID,Durasi Bicara,Detik,...,Tinjauan CS,Sampling user side,Hasil Pemeriksaan Kualitas,Hasil ASR,"Hasil Pemeriksaan Kualitas_defaultdict(<class 'int'>, {'No': 1, 'Tanggal Pengerjaan': 1, 'Checker': 1, 'ASI/AFI': 1, 'Waktu Inbound': 1, 'Nomor Inbound': 1, 'Nama Agent': 1, 'Call ID ': 1, 'Durasi Bicara': 1, 'Detik ': 1, 'Total Tiket Riskan': 1, 'Apakah Riskan?': 1, 'Tinjauan CS': 1, 'Status': 1, 'Sampling user side': 1, 'Hasil Pemeriksaan Kualitas': 2, 'Hasil ASR ': 1})[base]-1",Efektif,Kejelasan Suara,Suara Lain,Kelengkapan Rekaman,Agent Sampling
0,1,2025-03-10,Azer,ASI,2025-03-07 13:06:19,0812****7814,TCHL_Anindyka Alfisyah,09a7b53c48c7419ab54ca4b412cb6257,0:26:27,2025-03-07 13:06:54,...,No Data,Done,No Data,Entri Akurat,Percakapan Normal,Miss Target/ Not HC,"Sangat jelas, tidak bising sama sekali",0 Satu pembicara,0 Utuh,No Data
1,2,2025-03-10,Azer,ASI,2025-03-07 13:06:19,0812****7814,TCHL_Anindyka Alfisyah,09a7b53c48c7419ab54ca4b412cb6257,0:26:27,2025-03-07 13:07:08,...,No Data,Done,No Data,Entri Akurat,Percakapan Normal,Miss Target/ Not HC,"Sangat jelas, tidak bising sama sekali",0 Satu pembicara,0 Utuh,No Data
2,3,2025-03-10,Azer,ASI,2025-03-07 13:06:19,0812****7814,TCHL_Anindyka Alfisyah,09a7b53c48c7419ab54ca4b412cb6257,0:26:27,2025-03-07 13:07:13,...,No Data,Done,No Data,Terdapat kesalahan,Percakapan Normal,Miss Target/ Not HC,Cukup Jelas,1 Suara tumpang tindih,1 Tidak Utuh,No Data
3,4,2025-03-10,Azer,ASI,2025-03-07 13:06:19,0812****7814,TCHL_Anindyka Alfisyah,09a7b53c48c7419ab54ca4b412cb6257,0:26:27,2025-03-07 13:07:20,...,No Data,Done,No Data,Terdapat kesalahan,Percakapan Normal,Miss Target/ Not HC,Cukup Jelas,0 Satu pembicara,0 Utuh,No Data
4,5,2025-03-10,Azer,ASI,2025-03-07 13:06:19,0812****7814,TCHL_Anindyka Alfisyah,09a7b53c48c7419ab54ca4b412cb6257,0:26:27,2025-03-07 13:07:26,...,No Data,Done,No Data,Terdapat kesalahan,Percakapan Normal,Miss Target/ Not HC,Cukup Jelas,1 Suara tumpang tindih,0 Utuh,No Data
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146401,146402,NaT,No Data,No Data,NaT,No Data,No Data,No Data,No Data,No Data,...,No Data,No Data,No Data,No Data,No Data,No Data,No Data,No Data,No Data,No Data
146402,146403,NaT,No Data,No Data,NaT,No Data,No Data,No Data,No Data,No Data,...,No Data,No Data,No Data,No Data,No Data,No Data,No Data,No Data,No Data,No Data
146403,146404,NaT,No Data,No Data,NaT,No Data,No Data,No Data,No Data,No Data,...,No Data,No Data,No Data,No Data,No Data,No Data,No Data,No Data,No Data,No Data
146404,146405,NaT,No Data,No Data,NaT,No Data,No Data,No Data,No Data,No Data,...,No Data,No Data,No Data,No Data,No Data,No Data,No Data,No Data,No Data,No Data


In [17]:
df_clean.to_csv('../../dataset_qc/new_4_clean.csv')