In [2]:
import pandas as pd

# Load the original dataset
data_path = '../Raw_Data/Raw_MC.csv'
data = pd.read_csv(data_path)

# Key columns needed for GBS and Rockall score calculations
key_columns = ['sbp', 'heartrate', 'melaena', 'comorbliver', 'comorbcv', 'transfused', 'age']

# Identify rows with at least one missing value in key columns
missing_data_rows = data[key_columns].isnull().any(axis=1)
num_missing_data_rows = missing_data_rows.sum()

print(f"Number of rows with at least one missing value in key columns: {num_missing_data_rows}")

# Remove rows with missing data in key columns
data_cleaned = data[~missing_data_rows]

# Display the number of rows before and after removing rows with missing data
print(f"Number of rows before cleaning: {data.shape[0]}")
print(f"Number of rows after cleaning: {data_cleaned.shape[0]}")

# Apply the functions to calculate GBS and Rockall scores
def calculate_gbs(row):
    gbs = 0
    if row['sbp'] >= 110:
        gbs += 0
    elif 100 <= row['sbp'] < 110:
        gbs += 1
    elif 90 <= row['sbp'] < 100:
        gbs += 2
    else:
        gbs += 3

    if row['heartrate'] >= 100:
        gbs += 1
    
    if row['melaena'] == 'Yes':
        gbs += 1

    if row['comorbliver'] == 'Yes':
        gbs += 2

    if row['comorbcv'] == 'Yes':
        gbs += 2

    if row['transfused'] == 'Yes':
        gbs += 6

    return gbs

def calculate_rockall(row):
    rockall = 0
    if row['age'] < 60:
        rockall += 0
    elif 60 <= row['age'] < 80:
        rockall += 1
    else:
        rockall += 2

    if row['sbp'] > 100 and row['heartrate'] < 100:
        rockall += 0
    elif row['heartrate'] >= 100:
        rockall += 1
    else:
        rockall += 2

    if row['comorbcv'] == 'Yes' or row['comorbliver'] == 'Yes':
        rockall += 2

    return rockall

# Calculate GBS and Rockall scores for the cleaned dataset
data_cleaned['GBS'] = data_cleaned.apply(calculate_gbs, axis=1)
data_cleaned['Rockall'] = data_cleaned.apply(calculate_rockall, axis=1)

# Save the cleaned dataset with scores to a new CSV file
output_path = '../Python scripts/HALT_score_included_cleaned.csv'
data_cleaned.to_csv(output_path, index=False)

# Verify the number of rows in the saved data
saved_data = pd.read_csv(output_path)
print(f"Number of rows in the saved data: {saved_data.shape[0]}")

# Verify that GBS and Rockall scores are present
print(saved_data[['patientid', 'GBS', 'Rockall']].head())

  data = pd.read_csv(data_path)


Number of rows with at least one missing value in key columns: 45
Number of rows before cleaning: 11974
Number of rows after cleaning: 11929


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned['GBS'] = data_cleaned.apply(calculate_gbs, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned['Rockall'] = data_cleaned.apply(calculate_rockall, axis=1)


Number of rows in the saved data: 11932
  patientid   GBS  Rockall
0       7.0   1.0      3.0
1       8.0  13.0      4.0
2      11.0  10.0      5.0
3      16.0   1.0      1.0
4      19.0   7.0      2.0


  saved_data = pd.read_csv(output_path)
