In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, mean_squared_error

# Load the data
data = pd.read_csv('proj-data.csv')

# Data preprocessing
# Remove trailing colons from column names
data.columns = data.columns.str.rstrip(':').str.strip()

#print(data.columns)

# Replace non-numeric values with NaN
data.replace('?', np.nan, inplace=True)

# Handle binary columns
binary_columns = ['TSH measured', 'T3 measured', 'TT4 measured', 'T4U measured', 'FTI measured', 'TBG measured']
for col in binary_columns:
    data[col] = data[col].map({'f': 0, 't': 1})

# Convert appropriate columns to numeric
numeric_columns = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG']
for col in numeric_columns:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Handle missing values for numeric columns
imputer = SimpleImputer(strategy='mean')
data[numeric_columns] = imputer.fit_transform(data[numeric_columns])

# Encode categorical variables
label_encoders = {}
categorical_columns = ['sex', 'on thyroxine', 'query on thyroxine', 'on antithyroid medication', 'sick', 'pregnant',
                       'thyroid surgery', 'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium', 
                       'goitre', 'tumor', 'hypopituitary', 'psych', 'referral source', 'diagnoses']
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column].astype(str))

# Drop columns not needed for the model
data.drop(columns=['[record identification]'], inplace=True)

# Ensure all data is numeric
#print(data.dtypes)

# Check for non-numeric data in the dataset
for col in data.columns:
    if data[col].dtype == 'object':
        print(f"Column {col} contains non-numeric values.")
        print(data[col].unique())

# Split data into features and target
X = data.drop(['diagnoses'], axis=1)
y = data['diagnoses']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training for diagnose prediction
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)

# Model evaluation for diagnose prediction
y_pred = rf_classifier.predict(X_test)

# Retrieve original class labels for diagnoses
original_diagnoses_labels = label_encoders['diagnoses'].classes_

# Print class labels and their corresponding diagnoses
for class_label, diagnosis in enumerate(original_diagnoses_labels):
    print(f"Class {class_label}: {diagnosis}")

print(classification_report(y_test, y_pred))

# Feature importance for diagnose prediction
importances = rf_classifier.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False))

# Regression for age prediction
X_age = data.drop(['age'], axis=1)
y_age = data['age']
X_train_age, X_test_age, y_train_age, y_test_age = train_test_split(X_age, y_age, test_size=0.2, random_state=42)

rf_regressor = RandomForestRegressor()
rf_regressor.fit(X_train_age, y_train_age)

# Age prediction evaluation
y_pred_age = rf_regressor.predict(X_test_age)
predictions_df = pd.DataFrame({'Actual Age': y_test_age, 'Predicted Age': y_pred_age})
print(predictions_df)
print('MSE:', mean_squared_error(y_test_age, y_pred_age))


Class 0: -
Class 1: A
Class 2: AK
Class 3: B
Class 4: C
Class 5: C|I
Class 6: D
Class 7: D|R
Class 8: E
Class 9: F
Class 10: FK
Class 11: G
Class 12: GI
Class 13: GK
Class 14: GKJ
Class 15: H|K
Class 16: I
Class 17: J
Class 18: K
Class 19: KJ
Class 20: L
Class 21: LJ
Class 22: M
Class 23: MI
Class 24: MK
Class 25: N
Class 26: O
Class 27: OI
Class 28: P
Class 29: Q
Class 30: R
Class 31: S
              precision    recall  f1-score   support

           0       0.97      0.98      0.97      1114
           1       0.81      0.81      0.81        21
           2       1.00      1.00      1.00         4
           3       0.00      0.00      0.00         4
           5       0.00      0.00      0.00         1
           9       0.95      1.00      0.98        40
          11       0.89      1.00      0.94        41
          12       1.00      1.00      1.00         1
          13       1.00      1.00      1.00         7
          15       0.00      0.00      0.00         1
          16  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


      Actual Age  Predicted Age
996         34.0      63.892556
736         58.0      52.970000
1244        41.0      38.530000
1181        28.0      60.160000
3393        24.0      50.710000
...          ...            ...
705         72.0      37.990000
5219        79.0      58.750000
5407        77.0      58.600000
4267        69.0      53.190000
57          69.0      44.650000

[1468 rows x 2 columns]
MSE: 3237611.8962089363
