In [24]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [25]:

def _oneHotEncode(dataFrame, columns):
    """Replace ``columns`` in ``dataFrame`` with their one-hot encoded form.

    The first category of every column is dropped (``drop='first'``). The
    encoded columns are appended to the right of the remaining columns and
    named ``<column>_<category>``. Returns a new DataFrame; the input is
    not modified. When ``columns`` is empty the input is returned as is.
    """
    columns = list(columns)
    if not columns:
        # Nothing to encode; fitting an encoder on zero features would fail.
        return dataFrame

    # scikit-learn renamed `sparse` to `sparse_output` and later removed the
    # old keyword, so try the new spelling first and fall back for old versions.
    try:
        encoder = OneHotEncoder(drop='first', sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(drop='first', sparse=False)

    encoded = encoder.fit_transform(dataFrame[columns])

    # `get_feature_names` was replaced by `get_feature_names_out`.
    featureNames = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
    encodedFrame = pd.DataFrame(encoded, columns=featureNames(columns), index=dataFrame.index)

    return pd.concat([dataFrame.drop(columns=columns), encodedFrame], axis=1)


def _firstValueStartsWithA(series):
    """Return True when the first non-null value of ``series`` starts with 'A'."""
    nonNull = series.dropna()
    return (not nonNull.empty) and str(nonNull.iloc[0]).startswith('A')


def loadAndPreProcess(csvFilePath):
    """Load a CSV file and return a model-ready DataFrame.

    Steps, in order:
      1. Drop the ``sno`` column.
      2. One-hot encode ``poi``.
      3. One-hot encode every column whose values start with ``'A'``.
      4. Fill missing ``age`` values with the mean of the column.

    Parameters
    ----------
    csvFilePath : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by the ``poi_*`` columns and
        then the encoded 'A' columns.

    Raises
    ------
    KeyError
        If the file has no ``sno``, ``poi`` or ``age`` column.

    Notes
    -----
    The encoders and the mean age are fitted on the file being loaded, so the
    resulting columns depend on the categories present in that particular
    file; two files with different categories produce different columns.
    """
    dataFrame = pd.read_csv(csvFilePath)

    # Drop 'sno' column
    dataFrame = dataFrame.drop(columns='sno')

    # Identify columns whose values start with 'A'. The first non-null value
    # is inspected so a missing value in the first row cannot hide a column.
    # 'poi' is handled separately below and is therefore excluded here.
    a_columns = [
        col for col in dataFrame.columns
        if col != 'poi' and _firstValueStartsWithA(dataFrame[col])
    ]

    # One-hot encode 'poi' column
    dataFrame = _oneHotEncode(dataFrame, ['poi'])

    # One-hot encode 'A' columns
    dataFrame = _oneHotEncode(dataFrame, a_columns)

    # Fill missing ages with the average age. Assigning the result back avoids
    # chained in-place modification, which pandas may apply to a temporary copy.
    dataFrame['age'] = dataFrame['age'].fillna(dataFrame['age'].mean())

    return dataFrame


In [26]:
# Load the training CSV and run it through the preprocessing pipeline.
trainData = loadAndPreProcess("data/train.csv")



In [27]:
# Show column names, non-null counts and dtypes of the processed training frame.
trainData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 54 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   duration_month         800 non-null    int64  
 1   resident_since         800 non-null    int64  
 2   age                    800 non-null    float64
 3   credits_no             800 non-null    int64  
 4   liables                800 non-null    int64  
 5   Group_no               800 non-null    int64  
 6   poi_2.0                800 non-null    float64
 7   poi_3.0                800 non-null    float64
 8   poi_4.0                800 non-null    float64
 9   poi_nan                800 non-null    float64
 10  acc_info_A12           800 non-null    float64
 11  acc_info_A13           800 non-null    float64
 12  acc_info_A14           800 non-null    float64
 13  credit_history_A31     800 non-null    float64
 14  credit_history_A32     800 non-null    float64
 15  credit

In [28]:
# Save the processed training frame. With the default to_csv settings the row
# index is written as well, as an unnamed first column.
trainData.to_csv("preProcessedTrain.csv")

In [29]:
# Load the test CSV and run it through the same preprocessing pipeline.
# The encoders are re-fitted on this file, so its columns can differ from trainData's.
testData = loadAndPreProcess("data/test.csv")



In [30]:
# Add an all-zero 'gurantors_nan' column to the test frame, directly after
# 'gurantors_A103'. Presumably the test file has no missing 'gurantors'
# values, so the encoder never produced this column -- TODO confirm.
# The position is derived from the neighbouring column instead of a
# hard-coded index, and the guard keeps the cell safe to re-run
# (DataFrame.insert raises ValueError if the column already exists).
if 'gurantors_nan' not in testData.columns:
    testData.insert(testData.columns.get_loc('gurantors_A103') + 1, 'gurantors_nan', 0)

In [31]:
# Show column names, non-null counts and dtypes of the processed test frame.
testData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 53 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   duration_month         200 non-null    int64  
 1   resident_since         200 non-null    int64  
 2   age                    200 non-null    float64
 3   credits_no             200 non-null    int64  
 4   liables                200 non-null    int64  
 5   poi_2.0                200 non-null    float64
 6   poi_3.0                200 non-null    float64
 7   poi_4.0                200 non-null    float64
 8   poi_nan                200 non-null    float64
 9   acc_info_A12           200 non-null    float64
 10  acc_info_A13           200 non-null    float64
 11  acc_info_A14           200 non-null    float64
 12  credit_history_A31     200 non-null    float64
 13  credit_history_A32     200 non-null    float64
 14  credit_history_A33     200 non-null    float64
 15  credit

In [32]:
# Snapshot the column names of both frames as plain Python lists so they can
# be compared and edited without touching the frames themselves.
testClmns = [name for name in testData.columns]
trainClmns = [name for name in trainData.columns]

In [33]:
# Display the test column names for visual inspection.
testClmns

['duration_month',
 'resident_since',
 'age',
 'credits_no',
 'liables',
 'poi_2.0',
 'poi_3.0',
 'poi_4.0',
 'poi_nan',
 'acc_info_A12',
 'acc_info_A13',
 'acc_info_A14',
 'credit_history_A31',
 'credit_history_A32',
 'credit_history_A33',
 'credit_history_A34',
 'purpose_A41',
 'purpose_A410',
 'purpose_A42',
 'purpose_A43',
 'purpose_A44',
 'purpose_A45',
 'purpose_A46',
 'purpose_A48',
 'purpose_A49',
 'savings_acc_A62',
 'savings_acc_A63',
 'savings_acc_A64',
 'savings_acc_A65',
 'employment_st_A72',
 'employment_st_A73',
 'employment_st_A74',
 'employment_st_A75',
 'employment_st_nan',
 'personal_status_A92',
 'personal_status_A93',
 'personal_status_A94',
 'gurantors_A102',
 'gurantors_A103',
 'gurantors_nan',
 'property_type_A122',
 'property_type_A123',
 'property_type_A124',
 'installment_type_A142',
 'installment_type_A143',
 'housing_type_A152',
 'housing_type_A153',
 'housing_type_nan',
 'job_type_A172',
 'job_type_A173',
 'job_type_A174',
 'telephone_A192',
 'foreigner_A2

In [34]:
# List equality: True only if both lists hold the same names in the same order.
# NOTE(review): trainClmns presumably still contains 'Group_no' at this point,
# which alone makes the comparison False -- confirm.
testClmns == trainClmns

False

In [35]:
# Remove 'Group_no' from the training column list before comparing it with
# the test columns. The guard makes the cell safe to re-run: list.remove
# raises ValueError when the item is no longer present.
if 'Group_no' in trainClmns:
    trainClmns.remove('Group_no')

In [36]:
# Compare the two column lists in both directions and by position.
# Membership in one direction alone cannot detect columns that exist only in
# the training list, nor columns that are present in both but ordered differently.
mismatched_columns = [col for col in testClmns if col not in trainClmns]
missing_from_test = [col for col in trainClmns if col not in testClmns]

if mismatched_columns or missing_from_test:
    print("Mismatched columns:", mismatched_columns + missing_from_test)
elif testClmns != trainClmns:
    print("Same column names, but the order differs.")
else:
    print("Both lists have the same order of columns.")

Both lists have the same order of columns.


In [37]:
# Save the processed *test* frame. Writing trainData here would make
# preProcessedTest.csv a copy of the training data.
testData.to_csv("preProcessedTest.csv")