In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2

# Define column names
column_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
feature_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age']

# Load data
df = pd.read_csv(r'C:\Users\bhanu\Downloads\pima-indians-diabetes.data.csv', names=column_names)
print("Original DataFrame:")
print(df.head())

# Convert dataframe to numpy array
X = df.iloc[:, 0:8]  # Features
Y = df.iloc[:, 8]    # Target variable

# Feature selection
selector = SelectKBest(score_func=chi2, k=5)
fit = selector.fit(X, Y)

# Summarize scores
np.set_printoptions(precision=3)
print("Feature scores:")
print(dict(zip(feature_names, fit.scores_)))

# Transform dataset
selected_features = fit.transform(X)
print("Transformed Feature Shape:", selected_features.shape)



Original DataFrame:
   preg  plas  pres  skin  test  mass   pedi  age  class
0     6   148    72    35     0  33.6  0.627   50      1
1     1    85    66    29     0  26.6  0.351   31      0
2     8   183    64     0     0  23.3  0.672   32      1
3     1    89    66    23    94  28.1  0.167   21      0
4     0   137    40    35   168  43.1  2.288   33      1
Feature scores:
{'preg': 111.51969063588255, 'plas': 1411.887040644141, 'pres': 17.605373215320718, 'skin': 53.10803983632434, 'test': 2175.5652729220137, 'mass': 127.66934333103643, 'pedi': 5.39268154697145, 'age': 181.30368904430023}
Transformed Feature Shape: (768, 5)


In [6]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Define column names
column_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

# Load data
df = pd.read_csv(r'C:\Users\bhanu\Downloads\pima-indians-diabetes.data.csv', names=column_names)

# Convert dataframe to numpy array
X = df.iloc[:, 0:8]  # Features
Y = df.iloc[:, 8]    # Target variable

# Feature extraction
model = LogisticRegression(max_iter=400)

# Initialize RFE with the model and the number of features to select
rfe = RFE(estimator=model, n_features_to_select=3)

# Fit the RFE model
fit = rfe.fit(X, Y)

# Print selected features
print("Num Features: %d" % fit.n_features_)
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)

Num Features: 3
Selected Features:  [ True False False False False  True  True False]
Feature Ranking:  [1 2 4 6 5 1 1 3]


In [7]:
fit.n_features_


3

In [8]:
fit.support_

array([ True, False, False, False, False,  True,  True, False])

In [9]:
fit.ranking_

array([1, 2, 4, 6, 5, 1, 1, 3])

In [10]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

# Define column names
column_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

# Load data
df = pd.read_csv(r'C:\Users\bhanu\Downloads\pima-indians-diabetes.data.csv', names=column_names)

# Convert dataframe to numpy array
X = df.iloc[:, 0:8]  # Features
Y = df.iloc[:, 8]    # Target variable

# Feature extraction using DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X, Y)

# Print feature importances
print("Feature Importances:", model.feature_importances_)

Feature Importances: [0.058 0.318 0.1   0.026 0.042 0.215 0.132 0.108]
