In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
dataset = pd.read_csv('hepatitis_csv.csv')
dataset=dataset.dropna()
dataset = dataset.drop(['sex', 'fatigue','liver_firm','antivirals'], axis=1)
df_encoded = pd.get_dummies(dataset, prefix_sep='_')
print(df_encoded.head())


    histology  age  bilirubin  alk_phosphate   sgot  albumin  protime  \
5       False   34        0.9           95.0   28.0      4.0     75.0   
10      False   39        1.3           78.0   30.0      4.4     85.0   
11      False   32        1.0           59.0  249.0      3.7     54.0   
12      False   41        0.9           81.0   60.0      3.9     52.0   
13      False   30        2.2           57.0  144.0      4.9     78.0   

    steroid_False  steroid_True  malaise_False  ...  spleen_palpable_False  \
5           False          True           True  ...                   True   
10           True         False           True  ...                   True   
11          False          True           True  ...                   True   
12          False          True           True  ...                   True   
13          False          True           True  ...                   True   

    spleen_palpable_True  spiders_False  spiders_True  ascites_False  \
5                  F

In [3]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [4]:
print(X)

[[False True False ... 28.0 4.0 75.0]
 [False False False ... 30.0 4.4 85.0]
 [False True False ... 249.0 3.7 54.0]
 ...
 [True False False ... 173.0 4.2 54.0]
 [True False False ... 19.0 4.1 48.0]
 [True True False ... 19.0 3.1 42.0]]


In [5]:
print(y)

['live' 'live' 'live' 'live' 'live' 'live' 'live' 'live' 'live' 'live'
 'live' 'live' 'live' 'live' 'live' 'live' 'live' 'live' 'die' 'live'
 'live' 'live' 'live' 'live' 'live' 'live' 'live' 'live' 'live' 'live'
 'live' 'live' 'live' 'live' 'live' 'live' 'live' 'live' 'live' 'live'
 'live' 'live' 'live' 'live' 'live' 'live' 'live' 'live' 'live' 'live'
 'live' 'die' 'live' 'live' 'die' 'die' 'live' 'live' 'die' 'live' 'die'
 'die' 'live' 'live' 'live' 'live' 'die' 'live' 'die' 'live' 'live' 'die'
 'live' 'live' 'die' 'live' 'die' 'live' 'live' 'die']


In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0,1,2,3,4,5,6,7,8])], remainder='passthrough')

# X = np.array(ct.fit_transform(X))
    



df=dataset.drop(['class'],axis=1)
df_encoded = ct.fit_transform(X)
X=np.array(df_encoded)
columns = ct.get_feature_names_out(input_features=df.columns)
df_encoded = pd.DataFrame(df_encoded, columns=columns)
print(df_encoded.head())


  encoder__histology_False encoder__histology_True encoder__steroid_False  \
0                      1.0                     0.0                    0.0   
1                      1.0                     0.0                    1.0   
2                      1.0                     0.0                    0.0   
3                      1.0                     0.0                    0.0   
4                      1.0                     0.0                    0.0   

  encoder__steroid_True encoder__malaise_False encoder__malaise_True  \
0                   1.0                    1.0                   0.0   
1                   0.0                    1.0                   0.0   
2                   1.0                    1.0                   0.0   
3                   1.0                    1.0                   0.0   
4                   1.0                    1.0                   0.0   

  encoder__anorexia_False encoder__anorexia_True encoder__liver_big_False  \
0                     1.0  

In [7]:
print(X[0])

[1.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0
 34 0.9 95.0 28.0 4.0 75.0]


In [8]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
    

In [9]:
from sklearn.feature_selection import SelectKBest, chi2

# Create a SelectKBest object with chi-square test to select top k features, let's select all for illustration
kbest = SelectKBest(score_func=chi2, k='all')

# Fit and transform the data
X_best_features = kbest.fit_transform(X_train, y_train)


# Display scores for each feature
feature_scores = pd.DataFrame({
    'Feature': df_encoded.columns,
    'Score': kbest.scores_
}).sort_values(by='Score', ascending=False)


feature_scores['P-value'] = kbest.pvalues_
print(feature_scores)
feature_scores['Original_Feature'] = feature_scores['Feature'].apply(lambda x: x.split('_')[0])


                           Feature      Score       P-value
23              remainder__protime  94.254963  1.738994e-02
20        remainder__alk_phosphate  86.142131  4.036510e-03
18                  remainder__age   9.361081  2.827221e-01
15           encoder__ascites_True   9.308367  2.677171e-01
1          encoder__histology_True   8.267318  2.282392e-01
21                 remainder__sgot   7.218389  1.198203e-01
0         encoder__histology_False   5.656586  5.352861e-01
19            remainder__bilirubin   5.568887  1.496837e-01
13           encoder__spiders_True   5.156721  1.307971e-01
17           encoder__varices_True   3.240627  4.912286e-01
5            encoder__malaise_True   2.419668  5.122563e-01
8         encoder__liver_big_False   2.283019  1.942896e-01
12          encoder__spiders_False   2.177282  1.400616e-01
7           encoder__anorexia_True   2.075472  2.315657e-02
11   encoder__spleen_palpable_True   1.684787  2.171378e-01
14          encoder__ascites_False   1.5

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [11]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

In [12]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[0 1]
 [1 1]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]]


In [13]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[ 2  0]
 [ 2 12]]


0.875

In [14]:
from sklearn.model_selection import cross_validate

# Using cross_validate to get more detailed results
results = cross_validate(classifier, X_train, y_train, cv=5,
                         return_train_score=True, scoring=('accuracy', 'f1'))

# Display the results
print("Test set scores (Accuracy):", results['test_accuracy'])
print("Test set scores (F1):", results['test_f1'])
print("Average test accuracy: %0.2f" % results['test_accuracy'].mean())

Test set scores (Accuracy): [0.76923077 0.69230769 0.84615385 0.92307692 0.91666667]
Test set scores (F1): [0.85714286 0.81818182 0.91666667 0.95238095 0.95238095]
Average test accuracy: 0.83
