In [1]:
import pandas as pd
import numpy as np

In [25]:
# Base name of the dataset; every file name below is derived from it.
data_name = "credit"
input_name = f"./{data_name}.csv"              # CSV that is read in
key_name = f"{data_name}-cooked-key.csv"       # file receiving the column-group key
output_name = f"./{data_name}-cooked.csv"      # CSV that the encoded data is written to


In [None]:
# header=None: the file's first line is read as data, not as column names.
df = pd.read_csv(filepath_or_buffer=input_name, header=None)
print("Dataset shape:", df.shape)

##### Rename columns as 1,2, ...

In [5]:
# Relabel the columns with consecutive integers 1..n, where n is the number of columns.
n_columns = df.shape[1]
df.columns = np.arange(1, n_columns + 1)

##### If applicable, lowercase every string value so that categories within a column that differ only by letter case are treated as the same category.

In [None]:
# Labels of the columns stored with object dtype (not used further in this cell).
string_columns = df.select_dtypes(include=['object']).columns


def _lowercase_if_str(value):
    """Return ``value`` lowercased if it is a string; return it unchanged otherwise."""
    return value.lower() if isinstance(value, str) else value


# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map.
# Use whichever element-wise method the installed pandas version provides.
_apply_elementwise = df.map if hasattr(df, "map") else df.applymap
df = _apply_elementwise(_lowercase_if_str)


##### Check whether the label has more than two classes.

In [None]:
# The label column is addressed by the column count, i.e. the label of the last
# column when columns are numbered 1..n.
target_loc = len(df.columns)
# Relative frequency of each label value (displayed as the cell output).
df[target_loc].value_counts(normalize=True)

##### Make the label binary: the majority class versus all other classes.

In [None]:
# value_counts() orders classes from most to least frequent, so the first
# index entry is the majority class.
class_shares = df[target_loc].value_counts(normalize=True)
majority_class = class_shares.index[0]

# Encode the label as the string "1" for the majority class and "0" otherwise.
is_majority = df[target_loc].eq(majority_class)
df[target_loc] = is_majority.astype(int).astype(str)


##### check the number of unique values in each column.

In [11]:
# Tally columns by their number of distinct values:
#   count_0 -> none, count_1 -> exactly one, count_2 -> exactly two, count_3 -> three or more.
count_0 = count_1 = count_2 = count_3 = 0

for position, n_unique in enumerate(df.nunique(axis=0)):
    if n_unique == 0:
        count_0 += 1
    elif n_unique == 1:
        count_1 += 1
        # Prints the 0-based position of the single-valued column, not its column label.
        print(position)
    elif n_unique == 2:
        count_2 += 1
    else:
        count_3 += 1



##### drop the columns with only one unique value

In [None]:
# Number of distinct values per column.
unique_counts = df.nunique(axis=0)
# Columns holding exactly one distinct value.
is_single_valued = unique_counts.eq(1)
columns_to_drop = unique_counts.loc[is_single_valued].index
# df itself is left untouched; the reduced frame gets its own name.
df_cleaned = df.drop(columns=columns_to_drop)

##### Give the range of encoded feature indices for each original categorical feature.

In [None]:
# Build the key that maps each column of df_cleaned to the 1-based, inclusive
# range "start-end" of encoded columns it occupies. A column with three or more
# distinct values takes one encoded column per value; any other column takes one.
running_count = 0      # index of the last encoded column assigned so far
group_ranges = []      # (start, end) pairs, one per contributing column

for i in df_cleaned.nunique(axis=0):
    if i == 0:
        # A column with no non-missing values yields no columns from
        # pd.get_dummies, so it must not claim a range in the key; otherwise
        # every later range would be shifted by one.
        continue
    width = i if i >= 3 else 1
    group_ranges.append((running_count + 1, running_count + width))
    running_count = running_count + width

# Same layout as before: every range is followed by a comma, and the trailing
# comma is stripped only when writing.
groups_string = "".join(f"{start}-{end}," for start, end in group_ranges)

# The context manager closes the file even if the write raises.
with open(key_name, "w") as text_file:
    text_file.write(groups_string[:-1])

##### apply one-hot encoding for categorical features

In [None]:

# Relabel the remaining columns 1..n so they can be iterated by position-like labels.
df_cleaned.columns = np.arange(1, df_cleaned.shape[1] + 1, 1)

# Encode each column once and collect the pieces. Concatenating a single time
# at the end avoids re-copying the growing frame on every iteration.
encoded_parts = []
for col in df_cleaned.columns:
    # Three or more distinct values: standard one-hot encoding (all levels kept).
    # Otherwise: drop the first level so a two-valued column becomes one 0/1 column.
    keep_all_levels = df_cleaned[col].nunique() >= 3
    encoded_parts.append(
        pd.get_dummies(data=df_cleaned[col], drop_first=not keep_all_levels, dtype=int)
    )

# pd.concat raises on an empty list, so fall back to an empty frame.
df_new = pd.concat(encoded_parts, axis=1) if encoded_parts else pd.DataFrame()

# Total number of encoded columns produced.
count_total = sum(part.shape[1] for part in encoded_parts)



##### Rename columns, make values integer, and replace 0s with -1s for the linear model.

In [20]:
# Relabel the encoded columns with consecutive integers 1..n.
df_new.columns = np.arange(1, df_new.shape[1] + 1, 1)
# astype returns a new frame, so its result must be assigned for the cast to
# take effect. After the cast, map 0 -> -1 so every value is -1 or 1.
df_new = df_new.astype("int").replace([0], -1)

In [25]:
# Write the encoded data without a header row and without the index column.
df_new.to_csv(
    output_name,
    index=False,
    header=False,
)