In [None]:
import os
from pathlib import Path

import pandas as pd

# Location of the training data. The default is the original author's local
# path; set the CATB_TRAIN_PATH environment variable to run the notebook on
# another machine without editing this cell.
DATA_PATH = Path(
    os.environ.get(
        "CATB_TRAIN_PATH",
        "/Users/yongjun/Documents/School/Y2S2/datathon/catB_train.parquet",
    )
)

# Raw training frame; later cells reassign `df` as they clean and encode it.
df = pd.read_parquet(DATA_PATH)

In [None]:
# Count of missing values in each column
missing_values = df.isnull().sum()

# Percentage of missing values in each column
missing_percentage = (missing_values / len(df)) * 100

# Columns that contain no values at all carry no information: drop them.
columns_with_all_missing = missing_percentage[missing_percentage == 100].index
df = df.drop(columns=columns_with_all_missing)

# Columns that are mostly, but not entirely, missing. The fully-missing
# columns are excluded explicitly because they were just dropped from `df`;
# leaving them in this index would make any later `df[columns_with_high_missing]`
# lookup raise a KeyError.
columns_with_high_missing = missing_percentage[
    (missing_percentage > 50) & (missing_percentage < 100)
].index


In [None]:
from sklearn.impute import KNNImputer

# KNNImputer works on numeric data only, and every requested column must
# still exist in `df`. Restrict the high-missing columns accordingly so the
# cell does not fail on string columns or on columns dropped earlier.
# Non-numeric high-missing columns are intentionally left untouched here.
columns_to_impute = (
    df[df.columns.intersection(columns_with_high_missing)]
    .select_dtypes(include="number")
    .columns
)

# Create a KNN imputer with the desired number of neighbors (n_neighbors)
knn_imputer = KNNImputer(n_neighbors=5)

# Skip the fit when nothing qualifies: scikit-learn rejects a 0-feature input.
if len(columns_to_impute) > 0:
    df[columns_to_impute] = knn_imputer.fit_transform(df[columns_to_impute])


In [None]:
# Calculating class ratios of each categorical (object-dtype) column

target_column = 'stat_flag'

# Maps each object-dtype column name to a table whose rows are that column's
# values and whose columns are the target classes; each row sums to 1.
# Stored here so the tables can be inspected after the loop instead of being
# overwritten and lost on every iteration.
class_ratios_by_column = {}

for col in df.columns:
    if col == target_column or df[col].dtype != 'object':
        continue

    # Count rows per (column value, target class) pair ...
    class_counts = df.groupby([col, target_column]).size().unstack(fill_value=0)
    # ... and normalise each row so it holds the class proportions.
    class_ratios = class_counts.div(class_counts.sum(axis=1), axis=0)
    class_ratios_by_column[col] = class_ratios



In [None]:
# One-hot encode the categorical feature columns
target_column = 'stat_flag'

from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE

# Select the categorical feature columns. The frame is NOT blanket-cast to
# 'category' first: doing so would also convert numeric columns and leave no
# 'object' columns for the selection below to find. The target is excluded
# so it stays a single label column for modelling.
categorical_columns = (
    df.select_dtypes(include=['object', 'category'])
    .columns
    .drop(target_column, errors='ignore')
)

# Encode categorical variables
# NOTE(review): the encoded matrix is densified with .toarray(); a
# high-cardinality column (e.g. an ID) would make it very wide - consider
# dropping such columns before this cell.
encoder = OneHotEncoder()
if len(categorical_columns) > 0:
    encoded_values = encoder.fit_transform(df[categorical_columns].astype(str)).toarray()
    df_encoded = pd.DataFrame(
        encoded_values,
        columns=encoder.get_feature_names_out(categorical_columns),
        index=df.index,  # keep row alignment with `df` for the concat below
    )
else:
    # Nothing to encode (e.g. the cell was re-run): scikit-learn rejects a
    # 0-feature input, so fall back to an empty frame on the same index.
    df_encoded = pd.DataFrame(index=df.index)

# Drop the original categorical columns and concatenate the encoded ones
df = pd.concat([df.drop(columns=categorical_columns), df_encoded], axis=1)


In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

target_column = 'stat_flag'

# Check class ratios for the target column
class_ratios = df[target_column].value_counts(normalize=True)
print("Class Ratios:")
print(class_ratios)

# Features / label come from `df`, which holds both the encoded features and
# the target column (`df_encoded` contains only the one-hot columns).
x1 = df.drop(target_column, axis=1)
y1 = df[target_column]
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.2, random_state=42)

# Oversample the minority class when it makes up less than 10% of the data.
# SMOTE needs the full feature matrix with ALL classes present, and it is
# applied to the training split only so that no synthetic samples leak into
# the held-out test set.
# NOTE(review): SMOTE requires numeric, NaN-free features and (with default
# settings) more minority samples than its neighbour count - confirm the
# earlier cleaning cells guarantee this.
if class_ratios.min() < 0.1:
    smote = SMOTE(random_state=42)
    x1_train, y1_train = smote.fit_resample(x1_train, y1_train)

# Creating Decision Tree
decision_tree_model = DecisionTreeClassifier(random_state=42)
decision_tree_model.fit(x1_train, y1_train)
y1_pred = decision_tree_model.predict(x1_test)

# Evaluate the model on the untouched test split
accuracy = accuracy_score(y1_test, y1_pred)
print(accuracy)

print(classification_report(y1_test, y1_pred))
