In [1]:
import gspread
import pandas as pd
from google.oauth2.service_account import Credentials

In [2]:
# Location of the service-account key file and the OAuth scopes requested for it.
# NOTE(review): these look like the full read/write Sheets and Drive scopes, while
# this cell only reads from the sheet — confirm whether narrower scopes suffice.
SERVICE_ACCOUNT_FILE = '../../key/credentials.json'
SCOPES = ['https://www.googleapis.com/auth/spreadsheets',
          'https://www.googleapis.com/auth/drive']

# Authenticate as the service account and build the gspread client.
creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)
client = gspread.authorize(creds)

# Open the spreadsheet by title and read every value of its "Sampling" worksheet.
# NOTE(review): the title ends with a space — presumably it matches the real
# spreadsheet title; do not trim it without checking.
sheet = client.open("[3] AI QC Inbound CRM Review 语音智能质检打标复审 ").worksheet("Sampling")
data = sheet.get_all_values()

# The first row is used as the header, so an empty worksheet cannot be turned into
# a frame. Fail with a clear message instead of an opaque IndexError on data[0].
if not data:
    raise ValueError('Worksheet "Sampling" returned no rows; cannot build a DataFrame.')

# First row -> column names, remaining rows -> records.
df = pd.DataFrame(data[1:], columns=data[0])

In [3]:
# Show the column names, non-null counts and dtypes of the frame built from the sheet.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25240 entries, 0 to 25239
Data columns (total 26 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   NO                          25240 non-null  object
 1   Tanggal Sampling            25240 non-null  object
 2   Agent Sampling              25240 non-null  object
 3   Tanggal Pengerjaan          25240 non-null  object
 4   Checker                     25240 non-null  object
 5   ASI/AFI                     25240 non-null  object
 6   Waktu Inbound               25240 non-null  object
 7   Nomor Inbound               25240 non-null  object
 8   Nama Agent                  25240 non-null  object
 9   Call ID                     25240 non-null  object
 10  Durasi Bicara               25240 non-null  object
 11  Detik                       25240 non-null  object
 12  Total Tiket Riskan          25240 non-null  object
 13  Apakah Riskan?              25240 non-null  ob

In [4]:
# Data cleaning
# Work on a copy so the raw `df` stays untouched and this cell can be re-run safely.
df_clean = df.copy()

# Clean column names: trim surrounding whitespace and give a blank header the
# name 'Catatan Tambahan'.
# NOTE(review): every blank header receives the same name, so a sheet with more
# than one blank header would end up with duplicate column labels.
df_clean.columns = [
    col.strip() or 'Catatan Tambahan'
    for col in df_clean.columns
]

# Drop the column that is not needed up front, so no work is spent converting it.
# errors='ignore' keeps this a no-op when the column is absent.
df_clean = df_clean.drop(columns='Status', errors='ignore')

# Convert date columns. Only columns actually present are converted, matching the
# existence checks used for the numeric and categorical columns below.
# NOTE(review): no explicit format/dayfirst is passed, so parsing relies on pandas'
# inference — confirm it matches the date layout used in the sheet.
date_cols = [
    col
    for col in ('Tanggal Pengerjaan', 'Waktu Inbound', 'Tanggal Sampling')
    if col in df_clean.columns
]
for col in date_cols:
    # Unparseable or blank values become NaT rather than raising.
    df_clean[col] = pd.to_datetime(df_clean[col], errors='coerce')

# Convert numeric columns (non-numeric values become NaN).
if 'NO' in df_clean.columns:
    df_clean['NO'] = pd.to_numeric(df_clean['NO'], errors='coerce')

# Clean text columns: trim whitespace in every remaining object-dtype column.
text_cols = df_clean.select_dtypes(include='object').columns
df_clean[text_cols] = df_clean[text_cols].apply(lambda s: s.astype(str).str.strip())

# Replace empty strings with 'No Data'.
df_clean = df_clean.replace('', 'No Data')

# Convert categorical columns. 'Status' stays in the list but has already been
# dropped above, so the membership check skips it.
categorical_cols = [
    'Checker', 'ASI/AFI', 'Status',
    'Efektif', 'Kejelasan Suara', 'Suara Lain',
    'Kelengkapan Rekaman', 'Sampling user side', 
    'Hasil ASR', 'Hasil Pemeriksaan Kualitas', 'Agent Sampling'
]
for col in categorical_cols:
    if col in df_clean.columns:
        df_clean[col] = df_clean[col].astype('category')

# Summary of the cleaned frame.
df_clean.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25240 entries, 0 to 25239
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   NO                          25240 non-null  int64         
 1   Tanggal Sampling            25208 non-null  datetime64[ns]
 2   Agent Sampling              25240 non-null  category      
 3   Tanggal Pengerjaan          25208 non-null  datetime64[ns]
 4   Checker                     25240 non-null  category      
 5   ASI/AFI                     25240 non-null  category      
 6   Waktu Inbound               25208 non-null  datetime64[ns]
 7   Nomor Inbound               25240 non-null  object        
 8   Nama Agent                  25240 non-null  object        
 9   Call ID                     25240 non-null  object        
 10  Durasi Bicara               25240 non-null  object        
 11  Detik                       25240 non-null  object    

In [5]:
# Write the cleaned frame to CSV.
# NOTE(review): `index` is left at its default, so the RangeIndex is written as an
# extra unnamed first column; CSV also stores no dtype information, so datetime and
# category dtypes have to be re-applied by whoever reads the file back.
df_clean.to_csv("../../dataset_qc/kalib_sampling.csv")