In [2]:
import gspread
import pandas as pd
from google.oauth2.service_account import Credentials

## Pipeline A

In [3]:
SERVICE_ACCOUNT_FILE = '../../key/credentials.json'
SCOPES = ['https://www.googleapis.com/auth/spreadsheets',
          'https://www.googleapis.com/auth/drive']

creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)
client = gspread.authorize(creds)

sheet = client.open("2025 Analyze (Like/Dislike)").worksheet("Raw Data [Statistic form] 2024")
data = sheet.get_all_values()

df = pd.DataFrame(data[1:], columns=data[0])

In [4]:
df_clean = df.copy()

# Hapus spasi berlebih
df_clean.columns = df_clean.columns.str.strip()
df_clean.columns = df_clean.columns.str.replace(r'\s+', ' ', regex=True)

# Change to datetime
datetime_col = [
    'Date'
]
for col in datetime_col:
    if col in df_clean.columns:
        df_clean[col] = pd.to_datetime(df_clean[col], errors='coerce')

# Replacing the empty data with null
df_clean['business'].replace('', 'no data', inplace=True)

# Check is there any NA
if df_clean['business'].isna().any():
    df_clean['business'] = df_clean['business'].fillna('no data')

# Convert to categorycal data type
categorical_cols = [
    'business', 'Manual Check [business]', 'Team/Category', 'Week'
]
for col in categorical_cols:
    if col in df_clean.columns:
        df_clean[col] = df_clean[col].astype('category')

# Convert to numerical data type
numerical_cols = [
    'solved_num','unsolved_num'
]
for col in numerical_cols:
    if col in df_clean.columns:
        df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean['business'].replace('', 'no data', inplace=True)


In [5]:
df_clean

Unnamed: 0,Date,business,code_type,Code,solved_num,unsolved_num,Manual Check [business],Background detail,Team/Category,Week,Unnamed: 11
0,2024-01-01,no data,模型匹配,200101,17,23,AFI/ASI,chit-chat not identifikasi,No Category,,
1,2024-01-01,AFI,模型匹配,200101,8,36,AFI/ASI,chit-chat not identifikasi,No Category,,
2,2024-01-01,no data,高频FAQ,1344-1365-1427-278,4,6,ASI,Phone balance has not been received,Voucher,,
3,2024-01-01,ASI,模型匹配,1344-1365-1427-278,1,5,ASI,Phone balance has not been received,Voucher,,
4,2024-01-01,ASI,高频FAQ,1344-1365-1427-278,0,10,ASI,Phone balance has not been received,Voucher,,
...,...,...,...,...,...,...,...,...,...,...,...
159659,2025-08-31,4,模型匹配,872-957-1052-821,0,1,AFI,Bill payment before maturity,Akulaku Paylater,W4 Aug 2025,
159660,2025-08-31,no data,模型匹配,872-964-1059-886,0,1,AFI,Constraints to Pay Installments in Indomaret C...,Installment/Payment Gateway,W4 Aug 2025,
159661,2025-08-31,AFI,模型匹配,872-964-1059-886,0,1,AFI,Constraints to Pay Installments in Indomaret C...,Installment/Payment Gateway,W4 Aug 2025,
159662,2025-08-31,AFI,模型匹配,872-966-1061-888,0,1,AFI,Constraints to Pay down payment on Indomaret C...,Installment/Payment Gateway,W4 Aug 2025,


In [6]:
df_clean.to_csv('../../dataset_kula/kula_like_dislike.csv')