In [1]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1


In [2]:
import pandas as pd
import category_encoders as ce
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import joblib

In [7]:
def test_data_preparation(df):

    ids = df['ID']
    df = df.drop('ID', axis=1)
    # 1. Handle missing values (avoid inplace warnings, no duplicates)
    df['Blood Type'] = df['Blood Type'].fillna(df['Blood Type'].mode()[0])

    # 3. Convert dates and calculate 'Days Spent'
    df['Date of Admission'] = pd.to_datetime(df['Date of Admission'], errors='coerce', dayfirst=True)
    df['Discharge Date'] = pd.to_datetime(df['Discharge Date'], errors='coerce', dayfirst=True)
    df['Days Spent'] = (df['Discharge Date'] - df['Date of Admission']).dt.days

    # 4. Drop unused columns
    df = df.drop(['Name', 'Room Number', 'Discharge Date', 'Date of Admission', 'Doctor', 'Hospital'], axis=1)

    # 5. One-hot encode categorical columns consistently
    categorical_columns = [
        'Gender', 'Blood Type', 'Medical Condition',
        'Insurance Provider', 'Admission Type', 'Medication'
    ]
    for col in categorical_columns:
        if col in df.columns:
            dummies = pd.get_dummies(df[col], prefix=col)
            df = pd.concat([df, dummies], axis=1)
            df = df.drop(col, axis=1)




    for col in ['Age', 'Billing Amount', 'Days Spent']:
        scaler = MinMaxScaler()
        df[[col]] = scaler.fit_transform(df[[col]])

    pca = PCA(n_components=25)
    df_pca = pd.DataFrame(pca.fit_transform(df), columns=[f'PC{i+1}' for i in range(25)])
    # 9. Add back ID column
    df_pca.insert(0, 'ID', ids.reset_index(drop=True))

    return df_pca


In [4]:
# Read the raw test records into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/test data.csv')

In [5]:
# Load the previously saved classifier. joblib.load unpickles the file, which
# can execute arbitrary code, so only load model files from a trusted source.
model = joblib.load('/content/best_logistic_regression_model_PCA.pkl')

In [8]:
# Run the preprocessing pipeline on the loaded records (returns ID + PC columns).
X_test_ready = test_data_preparation(df=df)

In [9]:
# Preview the first five prepared rows.
X_test_ready.head(n=5)

Unnamed: 0,ID,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,...,PC16,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25
0,50001,-0.84239,0.070283,0.778231,0.113366,-0.153847,-0.330826,0.931119,0.482921,0.008727,...,-0.062636,-0.093565,0.055984,-0.098126,-0.089663,0.06042,-0.017288,-0.168951,0.173877,-0.296375
1,50002,-0.532368,-0.984623,-0.882938,0.651582,0.746939,-0.375314,-0.48808,-0.175935,0.042659,...,0.009155,0.124123,0.175643,-0.075341,-0.050533,0.887313,-0.470317,-0.253954,0.324469,-0.253845
2,50003,0.987193,-0.801924,0.91577,0.582849,0.795288,0.080196,-0.627505,-0.139099,0.004383,...,-0.068004,-0.119943,-0.087928,-0.136803,-0.132473,-0.057407,-0.025441,0.281834,0.341154,0.149919
3,50004,-0.529211,-0.813349,0.074075,0.585796,0.37344,0.057765,0.543744,-0.218356,-0.264487,...,0.04472,-0.024292,0.015827,0.141636,0.909252,-0.135555,-0.146718,-0.222881,0.218772,-0.063907
4,50005,-1.149825,0.848222,-0.105608,-0.198715,0.017867,-0.257302,-0.275779,0.333962,-0.233033,...,-0.604297,-0.261543,-0.103897,-0.032165,0.002234,-0.106432,0.027825,0.069422,-0.368785,0.165875


In [10]:
# Keep the identifiers aside and leave only the PC columns for prediction.
ids = X_test_ready.loc[:, 'ID']
X_test_ready = X_test_ready.drop(columns=['ID'])

In [11]:
# Preview the first five identifiers.
ids.head(n=5)

Unnamed: 0,ID
0,50001
1,50002
2,50003
3,50004
4,50005


In [12]:
# Predict a class code for every row; X_test_ready holds only the PC columns
# at this point because ID was dropped in the previous step.
predictions = model.predict(X_test_ready)

In [13]:
# Put the identifiers back as the first column. insert() modifies the frame
# in place and raises ValueError if 'ID' already exists, so this cell cannot
# be run twice in a row.
X_test_ready.insert(loc=0, column='ID', value=ids.reset_index(drop=True))

In [14]:
# Sanity check: 'ID' should be back in front of the PC columns.
print(X_test_ready.columns)

Index(['ID', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9',
       'PC10', 'PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16', 'PC17', 'PC18',
       'PC19', 'PC20', 'PC21', 'PC22', 'PC23', 'PC24', 'PC25'],
      dtype='object')


In [15]:
# Inspect the predicted class codes returned by the model.
print(predictions)

[0 1 2 ... 2 0 0]


In [16]:
# Attach the predicted class codes as a new column; the assignment is
# positional, one prediction per row of X_test_ready.
X_test_ready['Test Results'] = predictions

In [17]:
# Preview the first five rows, now including the 'Test Results' column.
X_test_ready.head(n=5)

Unnamed: 0,ID,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,...,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25,Test Results
0,50001,-0.84239,0.070283,0.778231,0.113366,-0.153847,-0.330826,0.931119,0.482921,0.008727,...,-0.093565,0.055984,-0.098126,-0.089663,0.06042,-0.017288,-0.168951,0.173877,-0.296375,0
1,50002,-0.532368,-0.984623,-0.882938,0.651582,0.746939,-0.375314,-0.48808,-0.175935,0.042659,...,0.124123,0.175643,-0.075341,-0.050533,0.887313,-0.470317,-0.253954,0.324469,-0.253845,1
2,50003,0.987193,-0.801924,0.91577,0.582849,0.795288,0.080196,-0.627505,-0.139099,0.004383,...,-0.119943,-0.087928,-0.136803,-0.132473,-0.057407,-0.025441,0.281834,0.341154,0.149919,2
3,50004,-0.529211,-0.813349,0.074075,0.585796,0.37344,0.057765,0.543744,-0.218356,-0.264487,...,-0.024292,0.015827,0.141636,0.909252,-0.135555,-0.146718,-0.222881,0.218772,-0.063907,1
4,50005,-1.149825,0.848222,-0.105608,-0.198715,0.017867,-0.257302,-0.275779,0.333962,-0.233033,...,-0.261543,-0.103897,-0.032165,0.002234,-0.106432,0.027825,0.069422,-0.368785,0.165875,0


In [18]:
# Take an explicit copy of the two submission columns so that later column
# assignments act on an independent frame instead of a slice of X_test_ready
# (assigning into the slice triggers pandas' SettingWithCopyWarning).
submission_df = X_test_ready[['ID', 'Test Results']].copy()

In [19]:
# Translate the numeric class codes into their text labels. The frame is
# rebuilt with assign() rather than written to column-wise, so no value is
# set on a slice of another DataFrame (no SettingWithCopyWarning).
# NOTE(review): confirm this code-to-label mapping against the encoding used
# at training time; an alphabetical encoder such as sklearn's LabelEncoder
# would instead give Abnormal=0, Inconclusive=1, Normal=2.
label_names = {
    0: 'Normal',
    1: 'Inconclusive',
    2: 'Abnormal'
}
submission_df = submission_df.assign(
    **{'Test Results': submission_df['Test Results'].replace(label_names)}
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_df['Test Results'] = submission_df['Test Results'].replace({


In [20]:
# Preview the first five rows of the submission table.
submission_df.head(n=5)

Unnamed: 0,ID,Test Results
0,50001,Normal
1,50002,Inconclusive
2,50003,Abnormal
3,50004,Inconclusive
4,50005,Normal


In [21]:
# Write the submission file to the working directory without the index column.
submission_df.to_csv(path_or_buf='submission.csv', index=False)