In [1]:
!pip install category_encoders



In [2]:
import pandas as pd
import category_encoders as ce
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import joblib

In [3]:
def data_preparation(df, label_column=None, scale_numeric=True, encode_categoricals=True, test_size=0.2, random_state=42):

  df['Name'] = df['Name'].str.title()

  df = pd.get_dummies(df, columns=['Gender'])

  df = pd.get_dummies(df, columns=['Blood Type'])

  df = pd.get_dummies(df, columns=['Medical Condition'])

  df['Date of Admission'] = pd.to_datetime(df['Date of Admission'], errors='coerce', dayfirst=True)

  df['Discharge Date'] = pd.to_datetime(df['Discharge Date'], errors='coerce', dayfirst=True)

  df['Doctor'] = df['Doctor'].str.title()

  df['Hospital'] = df['Hospital'].str.title()

  df = pd.get_dummies(df, columns=['Insurance Provider'])

  df = pd.get_dummies(df, columns=['Admission Type'])

  df = pd.get_dummies(df, columns=['Medication'])

  df['Billing_Is_Negative'] = df['Billing Amount'] < 0
  df['Billing Amount'] = df['Billing Amount'].abs()

  df['Billing_Is_Negative'] = df['Billing Amount'] < 0

  df['Billing Amount'] = df['Billing Amount'].abs()

  df['Name'] = df['Name'].str.strip()
  df['Name'] = df['Name'].str.replace(r'\s+', '_', regex=True)

  df['Doctor'] = df['Doctor'].str.strip()
  df['Doctor'] = df['Doctor'].str.replace(r'\s+', '_', regex=True)

  df['Hospital'] = df['Hospital'].str.strip()
  df['Hospital'] = df['Hospital'].str.replace(r'\s+', '_', regex=True)

  scaler = MinMaxScaler()

  billing_amount_values = df['Billing Amount'].values.reshape(-1, 1)

  normalized_billing_amount = scaler.fit_transform(billing_amount_values)

  df['Billing Amount'] = normalized_billing_amount

  scaler = MinMaxScaler()

  age_values = df['Age'].values.reshape(-1, 1)

  normalized_age = scaler.fit_transform(age_values)

  df['Age'] = normalized_age

  df = df.drop('Name', axis=1)

  df = df.drop('Room Number', axis=1)

  df['Days Spent'] = (df['Discharge Date'] - df['Date of Admission']).dt.days

  df = df.drop('Discharge Date', axis=1)

  df = df.drop('Date of Admission', axis=1)

  df = df.drop('Billing_Is_Negative', axis=1)

  scaler = MinMaxScaler()

  days_spent_values = df['Days Spent'].values.reshape(-1, 1)

  normalized_days_spent = scaler.fit_transform(days_spent_values)

  df['Days Spent'] = normalized_days_spent

  df = df.drop('Doctor', axis=1)

  df = df.drop('Hospital', axis=1)

  test_results_df = df[['ID']].copy()

  df = df.copy()
  df[df.select_dtypes(include='bool').columns] = df.select_dtypes(include='bool').astype(int)

  pca = PCA(n_components=2)
  pca_result = pca.fit_transform(df)


  pca_df = pd.DataFrame(pca_result, columns=['PC1', 'PC2'])

  merged_df = pd.concat([pca_df, test_results_df.reset_index(drop=True)], axis=1)

  return merged_df

In [4]:
df = pd.read_csv('/content/test data.csv')

pca_df = data_preparation(df)

In [5]:
pca_df.head()

Unnamed: 0,PC1,PC2,ID
0,-2749.499979,0.878682,50001
1,-2748.500009,0.557325,50002
2,-2747.500021,-0.967186,50003
3,-2746.499999,0.56209,50004
4,-2745.499983,1.17769,50005


In [6]:
model = joblib.load('/content/best_xgboost_model.pkl')

In [7]:
feature_columns = ['PC1', 'PC2']
X = pca_df[feature_columns]

In [8]:
X.head()

Unnamed: 0,PC1,PC2
0,-2749.499979,0.878682
1,-2748.500009,0.557325
2,-2747.500021,-0.967186
3,-2746.499999,0.56209
4,-2745.499983,1.17769


In [9]:
predictions = model.predict(X)

In [10]:
pca_df['Test Results'] = predictions

In [11]:
pca_df.head()

Unnamed: 0,PC1,PC2,ID,Test Results
0,-2749.499979,0.878682,50001,0
1,-2748.500009,0.557325,50002,0
2,-2747.500021,-0.967186,50003,0
3,-2746.499999,0.56209,50004,0
4,-2745.499983,1.17769,50005,0


In [12]:
submission_df = pca_df[['ID', 'Test Results']]

In [14]:
submission_df['Test Results'] = submission_df['Test Results'].replace({
    0: 'Normal',
    1: 'Inconclusive',
    2: 'Abnormal'
})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_df['Test Results'] = submission_df['Test Results'].replace({


In [15]:
submission_df.head()

Unnamed: 0,ID,Test Results
0,50001,Normal
1,50002,Normal
2,50003,Normal
3,50004,Normal
4,50005,Normal


In [16]:
submission_df.to_csv('submission.csv', index=False)