In [6]:
import gspread
import pandas as pd
from google.oauth2.service_account import Credentials

## Pipeline A

In [7]:
# Service-account key file and the OAuth scopes requested for Sheets and Drive.
SERVICE_ACCOUNT_FILE = '../../key/credentials.json'
SCOPES = [
    'https://www.googleapis.com/auth/spreadsheets',
    'https://www.googleapis.com/auth/drive',
]

# Authenticate with the service account and build a gspread client from it.
creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)
client = gspread.authorize(creds)

# Open the spreadsheet by title, then select the tab to read.
# NOTE(review): the title has a trailing space — presumably it matches the
# real spreadsheet name; confirm before "fixing" it.
spreadsheet = client.open("QC Chatbot ")
sheet = spreadsheet.worksheet("Sampling Bad Survey")
data = sheet.get_all_values()

# The first row supplies the column names; every following row is a record.
header_row, record_rows = data[0], data[1:]
df = pd.DataFrame(record_rows, columns=header_row)

In [8]:
# Summarise the raw frame built from the sheet: row count, column names,
# non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17721 entries, 0 to 17720
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Checking Date            17721 non-null  object
 1   Conversation Start Time  17721 non-null  object
 2   Week                     17721 non-null  object
 3   Month                    17721 non-null  object
 4   QC Name                  17721 non-null  object
 5   Business Type            17721 non-null  object
 6   UID                      17721 non-null  object
 7   ID Chat                  17721 non-null  object
 8   Main Category            17721 non-null  object
 9   QC Result                17721 non-null  object
 10  Sub Category             17721 non-null  object
 11  Suggestion Rate          17721 non-null  object
 12  Type                     17721 non-null  object
 13  Category                 17721 non-null  object
 14  Code_lama                17721 non-nul

In [9]:
# Work on a copy so the raw `df` is left untouched and this cell can be
# re-run on its own.
df_clean = df.copy()

# Trim leading/trailing whitespace on every text column FIRST, so that the
# date parsing, numeric conversion and duplicate detection below all operate
# on normalised text. (Previously this ran last, after those steps.)
# Both the classic `object` dtype and pandas string dtypes are covered.
df_clean = df_clean.apply(
    lambda col: col.str.strip()
    if pd.api.types.is_object_dtype(col) or pd.api.types.is_string_dtype(col)
    else col
)

# Parse the date columns with an explicit MM/DD/YYYY format; values that do
# not match are coerced to NaT instead of raising.
date_cols = ['Checking Date', 'Conversation Start Time']
for col in date_cols:
    df_clean[col] = pd.to_datetime(df_clean[col], format="%m/%d/%Y", errors='coerce')

# Drop the column that is not needed in the cleaned output.
df_clean = df_clean.drop(columns=['Code_lama'])

# Clean `Code`: missing values become '' and every '-' character is removed
# (regex replacement, so hyphens inside a longer value are stripped too).
df_clean['Code'] = (
    df_clean['Code']
    .fillna('')
    .replace('-', '', regex=True)
)

# Convert the rating to a number; entries that are not numeric become NaN.
df_clean['Rating'] = pd.to_numeric(df_clean['Rating'], errors='coerce')

# Remove fully duplicated rows. The surviving rows keep their original index
# labels, so the index is no longer contiguous after this step.
df_clean = df_clean.drop_duplicates()

df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17718 entries, 0 to 17720
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Checking Date            17717 non-null  datetime64[ns]
 1   Conversation Start Time  17718 non-null  datetime64[ns]
 2   Week                     17718 non-null  object        
 3   Month                    17718 non-null  object        
 4   QC Name                  17718 non-null  object        
 5   Business Type            17718 non-null  object        
 6   UID                      17718 non-null  object        
 7   ID Chat                  17718 non-null  object        
 8   Main Category            17718 non-null  object        
 9   QC Result                17718 non-null  object        
 10  Sub Category             17718 non-null  object        
 11  Suggestion Rate          17718 non-null  object        
 12  Type                     17718 non-nu

In [10]:
# Write the cleaned frame to CSV. With the default arguments the DataFrame
# index is written as the first, unnamed column.
# NOTE(review): if downstream readers do not rely on that index column,
# consider passing index=False — confirm with consumers of this file.
df_clean.to_csv('../../dataset_kula/bad_survey.csv')