In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#data = pd.read_csv("play_tennis.csv")
data=pd.read_csv("DLBCL-2.csv")
#data = data.drop('day',axis=1)
X = data.drop(columns=['target'])
y = data['target']
data['target'] = data['target'].map({'DLBCL': 0, 'FL': 1})

data.head(15)

def entropy(y):
    value_counts = y.value_counts()
    probabilities = value_counts / len(y)
    entropy_value = -np.sum(probabilities * np.log2(probabilities.replace(0, 1)))
    return entropy_value
entropy(y)

def information_gain(y, feature):

    total_entropy = entropy(y)
    
    unique_values = feature.unique()
    weighted_entropies = 0

    for value in unique_values:
        subset_y = y[feature == value]
        weighted_entropies += (len(subset_y) / len(y)) * entropy(subset_y)

    return total_entropy - weighted_entropies

for column in data.columns:
    if column != 'target':
        feature = data[column]
        ig = information_gain(y, feature)
        print(f"Feature: {column}, Information Gain: {ig:.4f}")

class Node:
    def __init__(self, feature=None, value=None, entropy=None, information_gain=None, left=None, right=None):
        self.feature = feature
        self.value = value
        self.entropy = entropy
        self.information_gain = information_gain
        self.left = left
        self.right = right

def build_decision_tree(X, y):
    if entropy(y) == 0:
        # If all instances have the same class, create a leaf node
        return Node(value=y.iloc[0])

    if X.empty:
        # If no features left, create a leaf node with the majority class
        return Node(value=y.value_counts().idxmax())

    # Find the best feature to split on
    best_feature = None
    max_info_gain = 0

    for feature_name in X.columns:
        current_info_gain = information_gain(y, X[feature_name])
        if current_info_gain > max_info_gain:
            max_info_gain = current_info_gain
            best_feature = feature_name

    # Create a node with the best feature
    node = Node(feature=best_feature, entropy=entropy(y), information_gain=max_info_gain, value={})

    # Recursively build the left and right subtrees
    unique_values = X[best_feature].unique()
    for value in unique_values:
        subset_X = X[X[best_feature] == value].drop(columns=[best_feature])
        subset_y = y[X[best_feature] == value]
        child_node = build_decision_tree(subset_X, subset_y)

        if node.value is None:
            node.value = {value: child_node}
        else:
            node.value[value] = child_node

    return node

decision_tree = build_decision_tree(X, y)


def predict(node, instance):
  if node.feature is None:
    return node.value  # Assuming 'value' holds the final class label here
  else:
    value = instance[node.feature]
    if value in node.value:
      return predict(node.value[value], instance)
    else:
      # Handle unseen feature values (return default class)
      return y_train.value_counts().idxmax()  # Assuming 'y_train' holds the training target variable


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

decision_tree = build_decision_tree(X_train, y_train)
y_pred = [predict(decision_tree, instance) for _, instance in X_test.iterrows()]

accuracy_score = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_score:.2f}")

from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,y_pred)
tp=cm[0][0]
fp=cm[0][1]
fn=cm[1][0]
tn=cm[1][1]
print (tp,fp,fn,tn)
acc=(tp+tn)/(tp+tn+fp+fn)
print (acc)

Feature: 200754_x_at, Information Gain: 0.9353
Feature: 217879_at, Information Gain: 0.9353
Feature: 210616_s_at, Information Gain: 0.9353
Feature: 212718_at, Information Gain: 0.9353
Feature: 209140_x_at, Information Gain: 0.9353
Feature: 209140_x_at.1, Information Gain: 0.9353
Feature: 222607_s_at, Information Gain: 0.9353
Feature: 216526_x_at, Information Gain: 0.9353
Feature: 217913_at, Information Gain: 0.9353
Feature: 1554078_s_at, Information Gain: 0.9353
Feature: 212870_at, Information Gain: 0.9353
Feature: 212239_at, Information Gain: 0.9353
Accuracy: 0.65
15 0 8 0
0.6521739130434783


In [23]:
y_test

78     1
10     0
4      0
84     1
64     0
68     0
30     0
45     0
96     1
11     0
79     1
80     1
0      0
81     1
18     0
70     0
56     0
72     1
109    1
42     0
12     0
36     0
65     0
Name: target, dtype: int64

In [24]:
y_train

26     0
22     0
31     0
47     0
76     1
      ..
106    1
14     0
92     1
51     0
102    1
Name: target, Length: 88, dtype: int64

In [25]:
y_pred

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [19]:
data.head()

Unnamed: 0,200754_x_at,217879_at,210616_s_at,212718_at,209140_x_at,209140_x_at.1,222607_s_at,216526_x_at,217913_at,1554078_s_at,212870_at,212239_at,target
0,13,10,11,13,15,15,9,15,10,11,10,10,
1,13,10,11,12,14,14,10,14,10,11,10,10,
2,13,10,11,12,15,15,10,15,10,10,10,10,
3,13,10,11,12,14,14,10,14,10,10,10,10,
4,13,10,11,12,15,15,10,15,9,10,10,9,


In [2]:
import pandas as pd


In [13]:
data=pd.read_csv("DLBCL-2.csv")

In [14]:
data.head()

Unnamed: 0,200754_x_at,217879_at,210616_s_at,212718_at,209140_x_at,209140_x_at.1,222607_s_at,216526_x_at,217913_at,1554078_s_at,212870_at,212239_at,target
0,13.178,10.223,11.207,13.433,15.752,15.752,9.707,15.517,10.525,11.283,10.691,10.267,0
1,13.457,10.815,11.597,12.175,14.721,14.721,10.116,14.705,10.549,11.1,10.34,10.535,0
2,13.297,10.134,11.051,12.37,15.206,15.206,10.738,15.441,10.333,10.768,10.59,10.897,0
3,13.695,10.394,11.005,12.796,14.832,14.832,10.713,14.917,10.293,10.941,10.365,10.292,0
4,13.711,10.106,11.357,12.427,15.296,15.296,10.069,15.412,9.961,10.317,10.463,9.941,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111 entries, 0 to 110
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   200754_x_at    111 non-null    float64
 1   217879_at      111 non-null    float64
 2   210616_s_at    111 non-null    float64
 3   212718_at      111 non-null    float64
 4   209140_x_at    111 non-null    float64
 5   209140_x_at.1  111 non-null    float64
 6   222607_s_at    111 non-null    float64
 7   216526_x_at    111 non-null    float64
 8   217913_at      111 non-null    float64
 9   1554078_s_at   111 non-null    float64
 10  212870_at      111 non-null    float64
 11  212239_at      111 non-null    float64
 12  target         111 non-null    object 
dtypes: float64(12), object(1)
memory usage: 10.9+ KB


In [13]:
data.tail(30)

Unnamed: 0,200754_x_at,217879_at,210616_s_at,212718_at,209140_x_at,209140_x_at.1,222607_s_at,216526_x_at,217913_at,1554078_s_at,212870_at,212239_at,target
81,7079.1,828.8,3085.5,7607.0,17783.5,17783.5,895.4,23226.2,1768.3,1592.8,1293.4,1106.8,1
82,9168.7,930.8,2092.6,9216.4,39741.5,39741.5,1088.5,44126.4,1322.4,1661.1,2075.3,2012.5,1
83,8855.4,1546.9,1154.6,8543.2,41566.4,41566.4,1499.7,41570.3,968.5,2219.9,2093.6,1941.3,1
84,8807.4,1064.7,2479.5,10767.8,31492.6,31492.6,1079.6,34430.1,1208.9,1891.6,2068.5,1678.1,1
85,8587.2,1261.5,2166.7,7929.3,31494.1,31494.1,1144.6,32345.8,1149.4,1680.7,1835.1,1988.3,1
86,10120.4,1016.3,2229.8,8238.5,31373.4,31373.4,1325.3,33882.6,1062.8,1780.0,2424.7,1901.7,1
87,10801.3,1217.2,1704.3,8704.9,30688.4,30688.4,1767.6,32040.2,1263.4,1960.2,2062.8,1664.3,1
88,11333.8,1381.9,1836.6,10215.9,31910.8,31910.8,1208.1,33196.8,1149.5,1856.4,1356.3,1795.1,1
89,11942.6,1353.0,1849.8,7701.4,37913.4,37913.4,1233.4,38911.5,1211.6,1732.0,1303.2,1881.9,1
90,10566.8,1291.0,1855.6,8676.5,31697.9,31697.9,1340.8,34818.6,1000.1,1540.8,1587.0,2218.1,1


In [12]:
target='target'
unique_values = df[target].unique()

# Print the unique values
print(unique_values)

['DLBCL' 'FL']


In [20]:
df=pd.read_csv("Financial_Coverage-2.csv")

In [21]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
