In [3]:
import gspread
import pandas as pd
from google.oauth2.service_account import Credentials

In [4]:
# Location of the service-account key and the OAuth scopes requested for it.
SERVICE_ACCOUNT_FILE = '../../key/credentials.json'
SCOPES = [
    'https://www.googleapis.com/auth/spreadsheets',
    'https://www.googleapis.com/auth/drive',
]

# Build credentials from the key file and authorise a gspread client with them.
creds = Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE,
    scopes=SCOPES,
)
client = gspread.authorize(creds)

# Open the spreadsheet by its exact title (the trailing space is part of the
# title string) and fetch every cell of the "Sampling" worksheet.
spreadsheet = client.open("[3] AI QC Inbound CRM Review 语音智能质检打标复审 ")
sheet = spreadsheet.worksheet("Sampling")
data = sheet.get_all_values()

# The first fetched row supplies the column names; the rest are the records.
record_rows = data[1:]
header_row = data[0]
df = pd.DataFrame(record_rows, columns=header_row)

In [5]:
# Summarise the raw frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26264 entries, 0 to 26263
Data columns (total 33 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   NO                                26264 non-null  object
 1   Tanggal Sampling                  26264 non-null  object
 2   Agent Sampling                    26264 non-null  object
 3   Tanggal Pengerjaan                26264 non-null  object
 4   Checker                           26264 non-null  object
 5   ASI/AFI                           26264 non-null  object
 6   Waktu Inbound                     26264 non-null  object
 7   Nomor Inbound                     26264 non-null  object
 8   Nama Agent                        26264 non-null  object
 9   Call ID                           26264 non-null  object
 10  Durasi Bicara                     26264 non-null  object
 11  Detik                             26264 non-null  object
 12  Total Tiket Riskan

In [6]:
# Data cleaning: derive a typed, normalised copy of the raw sheet in `df_clean`.
# `df` is never modified, so this cell can be re-run on its own.
df_clean = df.copy()

# Clean column names: trim surrounding whitespace and name a blank header
# 'Catatan Tambahan'.
df_clean.columns = [
    col.strip() or 'Catatan Tambahan'
    for col in df_clean.columns
]

# Normalise checker names that were entered in a shortened/misspelled form.
df_clean['Checker'] = df_clean['Checker'].replace({
    "Aul": "Aulia",
    "Iman": "Irman",
})

# Convert date columns; values that cannot be parsed become NaT.
# Columns missing from the sheet are skipped instead of raising KeyError.
# NOTE(review): no explicit format/dayfirst is passed, so pandas infers the
# day/month order -- confirm this matches the date format used in the sheet.
date_cols = ['Tanggal Sampling','Tanggal Pengerjaan', 'Waktu Inbound']
for col in date_cols:
    if col in df_clean.columns:
        df_clean[col] = pd.to_datetime(df_clean[col], errors='coerce')

# Convert numeric columns. Anything non-numeric (blank cells, 'No Data', ...)
# is coerced to NaN and then stored as 0 so the column can be a plain int.
numeric_cols = ['NO', 'Count Hasil ASR', 'Count Hasil Pemeriksaan Kualitas', 'Count Efektif', 'Count Kejelasan Suara', 'Count Suara Lain', 'Count Kelengkapan Rekaman', 'Count Revisi Text']
for col in numeric_cols:
    if col in df_clean.columns:
        df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce').fillna(0).astype(int)

# Trim whitespace in the remaining text (object-dtype) columns.
text_cols = df_clean.select_dtypes(include='object').columns
df_clean[text_cols] = df_clean[text_cols].apply(lambda x: x.astype(str).str.strip())

# Replace empty strings with the 'No Data' placeholder.
df_clean = df_clean.replace('', 'No Data')

# Drop the column that is not needed downstream (no-op when it is absent).
df_clean = df_clean.drop(columns='Status', errors='ignore')

# Store low-cardinality label columns as categoricals; names that are not
# present in the frame (including the just-dropped 'Status') are skipped.
categorical_cols = [
    'Checker', 'ASI/AFI', 'Status',
    'Efektif', 'Kejelasan Suara', 'Suara Lain',
    'Kelengkapan Rekaman', 'Sampling user side', 
    'Hasil ASR', 'Hasil Pemeriksaan Kualitas', 'Agent Sampling'
]
df_clean = df_clean.astype({
    col: 'category'
    for col in categorical_cols
    if col in df_clean.columns
})

# Show the resulting schema.
df_clean.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26264 entries, 0 to 26263
Data columns (total 32 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   NO                                26264 non-null  int64         
 1   Tanggal Sampling                  26239 non-null  datetime64[ns]
 2   Agent Sampling                    26264 non-null  category      
 3   Tanggal Pengerjaan                26239 non-null  datetime64[ns]
 4   Checker                           26264 non-null  category      
 5   ASI/AFI                           26264 non-null  category      
 6   Waktu Inbound                     26239 non-null  datetime64[ns]
 7   Nomor Inbound                     26264 non-null  object        
 8   Nama Agent                        26264 non-null  object        
 9   Call ID                           26264 non-null  object        
 10  Durasi Bicara                     26264 non-nu

In [7]:
# Display the cleaned frame (pandas abbreviates the rendering of large frames).
df_clean

Unnamed: 0,NO,Tanggal Sampling,Agent Sampling,Tanggal Pengerjaan,Checker,ASI/AFI,Waktu Inbound,Nomor Inbound,Nama Agent,Call ID,...,Kelengkapan Rekaman,Revisi Teks,Red Label,Count Hasil ASR,Count Hasil Pemeriksaan Kualitas,Count Efektif,Count Kejelasan Suara,Count Suara Lain,Count Kelengkapan Rekaman,Count Revisi Text
0,1,2025-03-11,Aulia,2025-03-03,Azer,AFI,2025-02-28 10:49:24,0812****5797,Naella Absoni,770789e745fb4182853763cd8d1b6a40,...,0 Utuh,No Data,No Data,0,0,0,0,0,0,0
1,2,2025-03-11,Aulia,2025-03-03,Azer,AFI,2025-02-28 10:49:24,0812****5797,Naella Absoni,770789e745fb4182853763cd8d1b6a40,...,0 Utuh,No Data,No Data,0,0,0,0,0,0,0
2,3,2025-03-11,Aulia,2025-03-03,Azer,AFI,2025-02-28 10:49:24,0812****5797,Naella Absoni,770789e745fb4182853763cd8d1b6a40,...,1 Tidak Utuh,No Data,No Data,0,0,0,0,0,0,0
3,4,2025-03-11,Aulia,2025-03-03,Azer,AFI,2025-02-28 10:49:24,0812****5797,Naella Absoni,770789e745fb4182853763cd8d1b6a40,...,1 Tidak Utuh,No Data,No Data,0,0,0,0,0,0,0
4,5,2025-03-11,Aulia,2025-03-03,Azer,AFI,2025-02-28 10:49:24,0812****5797,Naella Absoni,770789e745fb4182853763cd8d1b6a40,...,0 Utuh,No Data,No Data,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26259,26260,NaT,No Data,NaT,No Data,No Data,NaT,No Data,No Data,No Data,...,No Data,No Data,No Data,0,0,0,0,0,0,0
26260,26261,NaT,No Data,NaT,No Data,No Data,NaT,No Data,No Data,No Data,...,No Data,No Data,No Data,0,0,0,0,0,0,0
26261,26262,NaT,No Data,NaT,No Data,No Data,NaT,No Data,No Data,No Data,...,No Data,No Data,No Data,0,0,0,0,0,0,0
26262,26263,NaT,No Data,NaT,No Data,No Data,NaT,No Data,No Data,No Data,...,No Data,No Data,No Data,0,0,0,0,0,0,0


In [8]:
# List the distinct checker labels left after cleaning, to check the name normalisation.
df_clean['Checker'].unique()

['Azer', 'Neneng', 'Reza', 'Irman', 'Aulia', 'No Data']
Categories (6, object): ['Aulia', 'Azer', 'Irman', 'Neneng', 'No Data', 'Reza']

In [9]:
# Persist the cleaned frame as CSV.
# NOTE(review): the row index is written as an extra unnamed first column, and
# CSV does not carry category/datetime dtypes -- confirm downstream readers
# expect this (e.g. read with index_col=0 and re-parse dates).
df_clean.to_csv("../../dataset_qc/kalib_sampling.csv")