In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load dataset
dataset1 = pd.read_csv("prep.csv", index_col=None)

In [3]:
# Preprocess the data
df2 = dataset1
df2 = pd.get_dummies(df2, drop_first=True)
indep_x = df2.drop('classification_yes', axis=1)
dep_Y = df2['classification_yes']

In [4]:
# Split dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(indep_x, dep_Y, test_size=0.25, random_state=0)


In [5]:
# Scale the features
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [6]:
# Train a Random Forest model
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)


In [7]:
# Make predictions
y_pred = classifier.predict(X_test)

In [8]:
# Calculate accuracy
rf_accuracy = accuracy_score(y_test, y_pred)
rf_accuracy

0.99

In [9]:
# Feature importance
feature_importance = classifier.feature_importances_
final_df2 = pd.DataFrame({'Features': indep_x.columns, 'Importance scores': feature_importance})
final_df2 = final_df2.sort_values(by='Importance scores', ascending=False)
final_df2

Unnamed: 0,Features,Importance scores
10,pcv,0.336585
9,hrmo,0.151938
12,rc,0.10564
2,al,0.081954
6,sc,0.046408
4,bgr,0.045056
7,sod,0.043095
5,bu,0.039401
21,htn_yes,0.02722
14,sg_c,0.019915


In [10]:
# Select top k features using SelectKBest
def selectkbest(indep_x, dep_y, n):
    test = SelectKBest(score_func=chi2, k=n)
    fit = test.fit(indep_x, dep_y)
    selected_features = fit.transform(indep_x)
    selected_columns = indep_x.columns[fit.get_support(indices=True)]  # Get selected feature names
    return selected_features, fit.scores_, selected_columns

In [11]:
kbest_features, scores, selected_columns = selectkbest(indep_x, dep_Y, 6)
selected_columns

Index(['al', 'bgr', 'bu', 'sc', 'pcv', 'wc'], dtype='object')

In [23]:
# Define the function to record Random Forest accuracy
def selectk_Classification(accrf):
    dataframe = pd.DataFrame(index=['ChiSquare'], columns=['Random'])
    dataframe.loc['ChiSquare', 'Random'] = accrf[-1]  # Add the latest accuracy
    return dataframe

In [24]:
# Accuracy tracking
accrf = [rf_accuracy]  # Store Random Forest accuracy
result = selectk_Classification(accrf)

In [25]:
# Display the result
result

Unnamed: 0,Random
ChiSquare,0.99
