In [1]:
import gspread
import pandas as pd
from google.oauth2.service_account import Credentials

## Pipeline A

In [2]:
# Service-account key file and the OAuth scopes requested for it.
SERVICE_ACCOUNT_FILE = '../../key/credentials.json'
SCOPES = [
    'https://www.googleapis.com/auth/spreadsheets',
    'https://www.googleapis.com/auth/drive',
]

# Build credentials from the key file and authorize a gspread client with them.
creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)
client = gspread.authorize(creds)

# Open the spreadsheet by title, then select the worksheet to read.
workbook = client.open("LEMBAR KERJA - BAD SURVEY")
sheet = workbook.worksheet("Sampling Bad Survey")

# Pull every cell value; the first row is used as the header, the rest as records.
data = sheet.get_all_values()
header_row = data[0]
record_rows = data[1:]

df = pd.DataFrame(record_rows, columns=header_row)

In [3]:
# Inspect the raw frame as loaded from the sheet: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18987 entries, 0 to 18986
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Checking Date            18987 non-null  object
 1   Conversation Start Time  18987 non-null  object
 2   Week                     18987 non-null  object
 3   Month                    18987 non-null  object
 4   QC Name                  18987 non-null  object
 5   Business Type            18987 non-null  object
 6   UID                      18987 non-null  object
 7   ID Chat                  18987 non-null  object
 8   Main Category            18987 non-null  object
 9   QC Result                18987 non-null  object
 10  Sub Category
(kolom R)   18987 non-null  object
 11  Suggestion Rate          18987 non-null  object
 12  Type                     18987 non-null  object
 13  Category
(kolom O)       18987 non-null  object
 14  Code_lama                18987 non-nul

In [4]:
# Work on a copy so the raw frame (`df`) stays untouched and this cell can be
# re-run on its own without reloading the sheet.
df_clean = df.copy()

# Clean column names: remove a trailing "\n(kolom X)" annotation from headers,
# e.g. "Sub Category\n(kolom R)" -> "Sub Category".
df_clean.columns = df_clean.columns.str.replace(
    r'\n\s*\(kolom\s*[A-Za-z]\)', '', regex=True, case=False
)

# Convert date columns; values that do not match the format become NaT.
date_cols = ['Checking Date', 'Conversation Start Time']
for col in date_cols:
    df_clean[col] = pd.to_datetime(df_clean[col], format="%m/%d/%Y", errors='coerce')

# Delete unnecessary columns (reassign instead of mutating in place).
df_clean = df_clean.drop(columns=['Code_lama'])

# Clean column Code: fill missing values with '' and remove every hyphen.
df_clean['Code'] = (
    df_clean['Code']
    .fillna('')
    .replace('-', '', regex=True)
)

# Convert rating to numeric; non-numeric values become NaN.
df_clean['Rating'] = pd.to_numeric(df_clean['Rating'], errors='coerce')

# Strip leading/trailing whitespace in all object (string) columns.
# This must run BEFORE de-duplication: otherwise rows that differ only by
# stray spaces are kept as distinct and turn into exact duplicates afterwards.
df_clean = df_clean.apply(lambda col: col.str.strip() if col.dtype == "object" else col)

# Remove duplicate rows (the original index labels of kept rows are preserved).
df_clean = df_clean.drop_duplicates()

df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18984 entries, 0 to 18986
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Checking Date            18983 non-null  datetime64[ns]
 1   Conversation Start Time  18984 non-null  datetime64[ns]
 2   Week                     18984 non-null  object        
 3   Month                    18984 non-null  object        
 4   QC Name                  18984 non-null  object        
 5   Business Type            18984 non-null  object        
 6   UID                      18984 non-null  object        
 7   ID Chat                  18984 non-null  object        
 8   Main Category            18984 non-null  object        
 9   QC Result                18984 non-null  object        
 10  Sub Category             18984 non-null  object        
 11  Suggestion Rate          18984 non-null  object        
 12  Type                     18984 non-nu

In [5]:
# Persist the cleaned frame to disk for downstream use.
# NOTE(review): the DataFrame index is written as the first (unnamed) CSV column
# (pandas default) — confirm whether consumers rely on it before passing index=False.
bad_survey_csv = '../../dataset_kula/bad_survey.csv'
df_clean.to_csv(bad_survey_csv)