In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, mean_squared_error

# Load the data
data = pd.read_csv('proj-data.csv')

# Data preprocessing
# Remove trailing colons from column names
data.columns = data.columns.str.rstrip(':').str.strip()

# Replace non-numeric values with NaN
data.replace('?', np.nan, inplace=True)

# Handle binary columns
binary_columns = ['TSH measured', 'T3 measured', 'TT4 measured', 'T4U measured', 'FTI measured', 'TBG measured']
for col in binary_columns:
    data[col] = data[col].map({'f': 0, 't': 1})

# Convert appropriate columns to numeric
numeric_columns = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG']
for col in numeric_columns:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Handle missing values for numeric columns
imputer = SimpleImputer(strategy='mean')
data[numeric_columns] = imputer.fit_transform(data[numeric_columns])

# Encode categorical variables
label_encoders = {}
categorical_columns = ['sex', 'on thyroxine', 'query on thyroxine', 'on antithyroid medication', 'sick', 'pregnant',
                       'thyroid surgery', 'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium', 
                       'goitre', 'tumor', 'hypopituitary', 'psych', 'referral source', 'diagnoses']
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column].astype(str))

# Preserve the record identification column
record_identification = data['[record identification]'].values

# Drop the record identification column from the main DataFrame
data.drop(columns=['[record identification]'], inplace=True)

# Ensure all data is numeric
#print(data.dtypes)

# Check for non-numeric data in the dataset
for col in data.columns:
    if data[col].dtype == 'object':
        print(f"Column {col} contains non-numeric values.")
        print(data[col].unique())

# Diagnose prediction model
X_diagnosis = data.drop(['diagnoses'], axis=1)
y_diagnosis = data['diagnoses']

# Split data into training and testing sets for diagnosis prediction
X_train_diag, X_test_diag, y_train_diag, y_test_diag = train_test_split(X_diagnosis, y_diagnosis, test_size=0.2, random_state=42)

# Model training for diagnosis prediction
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_diag, y_train_diag)

# Model evaluation for diagnosis prediction
y_pred_diag = rf_classifier.predict(X_test_diag)

# Retrieve original class labels for diagnoses
original_diagnoses_labels = label_encoders['diagnoses'].classes_

# Print class labels and their corresponding diagnoses
for class_label, diagnosis in enumerate(original_diagnoses_labels):
    print(f"Class {class_label}: {diagnosis}")

print(classification_report(y_test_diag, y_pred_diag))

# Feature importance for diagnosis prediction
importances_diag = rf_classifier.feature_importances_
feature_importance_df_diag = pd.DataFrame({'Feature': X_diagnosis.columns, 'Importance': importances_diag})
print(feature_importance_df_diag.sort_values(by='Importance', ascending=False))

# Age prediction model
X_age = data.drop(['age'], axis=1)
y_age = data['age']

# Split data into training and testing sets for age prediction
X_train_age, X_test_age, y_train_age, y_test_age, record_train, record_test = train_test_split(X_age, y_age, record_identification, test_size=0.2, random_state=42)

# Model training for age prediction
rf_regressor = RandomForestRegressor()
rf_regressor.fit(X_train_age, y_train_age)

# Age prediction evaluation
y_pred_age = rf_regressor.predict(X_test_age)

# Create a DataFrame to display the results with record identification
predictions_df = pd.DataFrame({'Record Identification': record_test, 'Actual Age': y_test_age, 'Predicted Age': y_pred_age})
print(predictions_df)
print('MSE:', mean_squared_error(y_test_age, y_pred_age))


Class 0: -
Class 1: A
Class 2: AK
Class 3: B
Class 4: C
Class 5: C|I
Class 6: D
Class 7: D|R
Class 8: E
Class 9: F
Class 10: FK
Class 11: G
Class 12: GI
Class 13: GK
Class 14: GKJ
Class 15: H|K
Class 16: I
Class 17: J
Class 18: K
Class 19: KJ
Class 20: L
Class 21: LJ
Class 22: M
Class 23: MI
Class 24: MK
Class 25: N
Class 26: O
Class 27: OI
Class 28: P
Class 29: Q
Class 30: R
Class 31: S
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      1114
           1       0.75      0.71      0.73        21
           2       1.00      1.00      1.00         4
           3       0.00      0.00      0.00         4
           5       0.00      0.00      0.00         1
           9       0.95      1.00      0.98        40
          11       0.82      1.00      0.90        41
          12       0.00      0.00      0.00         1
          13       0.88      1.00      0.93         7
          15       0.00      0.00      0.00         1
          16  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


     Record Identification  Actual Age  Predicted Age
996            [841122082]        34.0      62.706865
736            [850912062]        58.0      57.530000
1244           [841127060]        41.0      37.470000
1181           [840823010]        28.0      66.330000
3393           [861121057]        24.0      52.720000
...                    ...         ...            ...
705            [850423010]        72.0      41.190000
5219           [860210066]        79.0      59.290000
5407           [861119045]        77.0      60.940000
4267           [850801049]        69.0      55.280000
57             [840903010]        69.0      40.440000

[1468 rows x 3 columns]
MSE: 3148451.5701136123
