In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt

In [2]:
def selectkbest(indep_X,dep_Y,n):
    """Keep the ``n`` features of ``indep_X`` that score highest on the chi-squared test.

    Parameters
    ----------
    indep_X : pandas.DataFrame
        Feature table. It must expose ``.columns`` because the selected
        column names are looked up and printed.
    dep_Y : array-like
        Target values passed to the chi-squared scorer.
    n : int
        Number of features to keep (forwarded as ``k`` to ``SelectKBest``).

    Returns
    -------
    The result of ``SelectKBest.transform(indep_X)``, i.e. ``indep_X``
    reduced to the selected columns. The column names are not part of the
    return value; they are only printed.
    """
    selector = SelectKBest(score_func=chi2, k=n)
    selector.fit(indep_X, dep_Y)

    # Boolean mask over the input columns marking which ones were kept.
    kept_mask = selector.get_support()
    selected_features = indep_X.columns[kept_mask].tolist()
    print(f"Top {n} Features: {selected_features}")

    return selector.transform(indep_X)

In [3]:
def split_scalar(indep_X,dep_Y):
    """Split the data 75/25 into train/test sets and standardize the features.

    The split uses ``random_state=0`` so it is reproducible. The scaler is
    fitted on the training features only and then applied to the test
    features, so no test-set statistics enter the fit. Targets are returned
    unscaled.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` with both feature sets scaled.
    """
    raw_train, raw_test, y_train, y_test = train_test_split(
        indep_X,
        dep_Y,
        test_size=0.25,
        random_state=0,
    )

    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(raw_train)
    scaled_test = scaler.transform(raw_test)

    return scaled_train, scaled_test, y_train, y_test

In [4]:
def r2_prediction(regressor,X_test,y_test):
    """Return the R^2 score of an already-fitted ``regressor`` on the test data.

    Parameters
    ----------
    regressor : object
        Any fitted estimator exposing ``predict(X)``.
    X_test : array-like
        Features to predict on.
    y_test : array-like
        True target values the predictions are compared against.
    """
    # Imported here because the notebook's import cell does not provide r2_score.
    from sklearn.metrics import r2_score

    predictions = regressor.predict(X_test)
    return r2_score(y_test, predictions)

In [5]:
def Linear(X_train,y_train,X_test,y_test):
    """Fit a ``LinearRegression`` model on the training set and return its test R^2."""
    from sklearn.linear_model import LinearRegression

    model = LinearRegression()
    model.fit(X_train, y_train)
    return r2_prediction(model, X_test, y_test)
    
def svm_linear(X_train,y_train,X_test,y_test):
    """Fit a support-vector regressor with a linear kernel and return its test R^2."""
    from sklearn.svm import SVR

    model = SVR(kernel='linear')
    model.fit(X_train, y_train)
    return r2_prediction(model, X_test, y_test)
    
def svm_NL(X_train,y_train,X_test,y_test):
    """Fit a support-vector regressor with the non-linear RBF kernel and return its test R^2."""
    from sklearn.svm import SVR

    model = SVR(kernel='rbf')
    model.fit(X_train, y_train)
    return r2_prediction(model, X_test, y_test)
     

def Decision(X_train,y_train,X_test,y_test):
    """Fit a ``DecisionTreeRegressor`` (seeded with ``random_state=0``) and return its test R^2."""
    from sklearn.tree import DecisionTreeRegressor

    model = DecisionTreeRegressor(random_state=0)
    model.fit(X_train, y_train)
    return r2_prediction(model, X_test, y_test)
     

def random(X_train,y_train,X_test,y_test):
    """Fit a 10-tree ``RandomForestRegressor`` (``random_state=0``) and return its test R^2.

    Note: this function's name is the same as the standard-library ``random``
    module, so that module cannot be referenced by its plain name in the same
    namespace.
    """
    from sklearn.ensemble import RandomForestRegressor

    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(X_train, y_train)
    return r2_prediction(model, X_test, y_test)

In [6]:
def selectk_regression(acclin,accsvml,accsvmnl,accdes,accrf):
    """Assemble a one-row table of scores, one column per model.

    Each argument is an indexable sequence of scores; only element ``[0]`` of
    each is used, because the table has a single row.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``'ChiSquare'`` with the columns ``Linear``,
        ``SVMl``, ``SVMnl``, ``Decision`` and ``Random``, in that order.
    """
    labelled_scores = (
        ('Linear', acclin),
        ('SVMl', accsvml),
        ('SVMnl', accsvmnl),
        ('Decision', accdes),
        ('Random', accrf),
    )

    # Wrap each first score in a list so every column holds exactly one row.
    first_row = {label: [scores[0]] for label, scores in labelled_scores}
    return pd.DataFrame(first_row, index=['ChiSquare'])

In [7]:
# Main function
def main():
    """Run the feature-selection and model-comparison pipeline and print the scores.

    Steps:
      1. Read ``prep.csv`` and one-hot encode it with ``drop_first=True``.
      2. Use the ``classification_yes`` column as the target and every other
         column as a candidate feature.
      3. Keep the 5 best features according to ``selectkbest``.
      4. Split and scale the data with ``split_scalar``.
      5. Fit each of the five regressors once and collect its test R^2.
      6. Print the one-row summary table built by ``selectk_regression``.

    Returns ``None``; the result is only printed.

    NOTE(review): ``classification_yes`` looks like a 0/1 dummy column created
    by ``get_dummies``, while every model here is a regressor scored with R^2.
    Confirm that treating the target as continuous is intended.
    """
    dataset = pd.read_csv("prep.csv", index_col=None)
    encoded = pd.get_dummies(dataset, drop_first=True)

    dep_Y = encoded['classification_yes']
    indep_X = encoded.drop(columns=['classification_yes'])

    kbest = selectkbest(indep_X, dep_Y, 5)

    X_train, X_test, y_train, y_test = split_scalar(kbest, dep_Y)

    # Each model is fitted exactly once. The earlier version wrapped these
    # calls in ``for i in kbest``, which looped over every row of the feature
    # array and refitted all five models each time on the same split. Every
    # iteration gave the same scores and only the first was ever reported.
    acclin = [Linear(X_train, y_train, X_test, y_test)]
    accsvml = [svm_linear(X_train, y_train, X_test, y_test)]
    accsvmnl = [svm_NL(X_train, y_train, X_test, y_test)]
    accdes = [Decision(X_train, y_train, X_test, y_test)]
    accrf = [random(X_train, y_train, X_test, y_test)]

    result = selectk_regression(acclin, accsvml, accsvmnl, accdes, accrf)
    print(result)
    
# Run the pipeline only when this code is executed directly. In a notebook
# kernel ``__name__`` is '__main__', so running this cell calls main(); importing
# the code as a module does not.
if __name__ == '__main__':
    main()

Top 5 Features: ['bgr', 'bu', 'sc', 'pcv', 'wc']
             Linear      SVMl     SVMnl  Decision    Random
ChiSquare  0.551985  0.545395  0.749654  0.696181  0.836806
