In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import chi2
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [2]:
def forward_selection_chi2(X, y, max_features=5):
    selected_features = []
    remaining_features = list(X.columns)
    
    for i in range(max_features):
        best_feature = None
        best_score = -np.inf  # Store the highest Chi-Square score
        
        for feature in remaining_features:
            # this will check or add one feature at a time
            X_temp = X[selected_features + [feature]]
            # Perform Chi-Square test
            score, i = chi2(X_temp, y)
            # Average Chi-Square score for the feature
            feature_score = np.mean(score)  
            # Select the best scoring feature
            if feature_score > best_score:  
                best_score = feature_score
                best_feature = feature
        
        if best_feature:
            selected_features.append(best_feature)
            remaining_features.remove(best_feature)
    #X_temp is just a temporary dataset that includes the currently tested feature in each iteration
    #will only contain the features for the last evaluated feature, not the final selected set. so we are returning X[selected_features]
    return X[selected_features], selected_features

In [3]:
def split_scalar(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)    
    return X_train, X_test, y_train, y_test

In [4]:
def r2_prediction(regressor, X_test, y_test):
    y_pred = regressor.predict(X_test)
    from sklearn.metrics import r2_score
    r2=r2_score(y_test,y_pred)
    return r2

In [5]:
def Linear(X_train,y_train,X_test):       
        # Fitting K-NN to the Training set
        from sklearn.linear_model import LinearRegression
        regressor = LinearRegression()
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2

In [6]:
def svm_linear(X_train,y_train,X_test):
                
        from sklearn.svm import SVR
        regressor = SVR(kernel = 'linear')
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2 

In [7]:
def svm_NL(X_train,y_train,X_test):
                
        from sklearn.svm import SVR
        regressor = SVR(kernel = 'rbf')
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2

In [8]:
def Decision(X_train,y_train,X_test):
        
        # Fitting K-NN to the Training setC
        from sklearn.tree import DecisionTreeRegressor
        regressor = DecisionTreeRegressor(random_state = 0)
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2 

In [9]:
def random(X_train,y_train,X_test):       
        # Fitting K-NN to the Training set
        from sklearn.ensemble import RandomForestRegressor
        regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
        regressor.fit(X_train, y_train)
        r2=r2_prediction(regressor,X_test,y_test)
        return  r2

In [10]:
def select_forward(acclin,accsvml,accsvmnl,accdes,accrf): 
    
    dataframe=pd.DataFrame(index=['Forward ChiSquare'],columns=['Linear','SVMl','SVMnl','Decision','Random'
                                                                                     ])

    for number,idex in enumerate(dataframe.index):
        
        dataframe['Linear'][idex]=acclin[number]       
        dataframe['SVMl'][idex]=accsvml[number]
        dataframe['SVMnl'][idex]=accsvmnl[number]
        dataframe['Decision'][idex]=accdes[number]
        dataframe['Random'][idex]=accrf[number]
    return dataframe

In [11]:
dataset1=pd.read_csv("prep.csv",index_col=None)
df2=dataset1
df2 = pd.get_dummies(df2, drop_first=True)

In [12]:
X=df2.drop('classification_yes', axis=1)
y=df2['classification_yes']

In [22]:
X_selected,selected_columns=forward_selection_chi2(X, y, max_features=7)
acclin=[]
accsvml=[]
accsvmnl=[]
accdes=[]
accrf=[]

In [23]:
X_train, X_test, y_train, y_test = split_scalar(X_selected, y)
for i in X_selected:   
    r2_lin=Linear(X_train,y_train,X_test)
    acclin.append(r2_lin)
        
    r2_sl=svm_linear(X_train,y_train,X_test)    
    accsvml.append(r2_sl)
        
    r2_NL=svm_NL(X_train,y_train,X_test)
    accsvmnl.append(r2_NL)
        
    r2_d=Decision(X_train,y_train,X_test)
    accdes.append(r2_d)
        
    r2_r=random(X_train,y_train,X_test)
    accrf.append(r2_r)
    
result=select_forward(acclin,accsvml,accsvmnl,accdes,accrf)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  dataframe['Linear'][idex]=acclin[number]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame o

In [21]:
#4
result

Unnamed: 0,Linear,SVMl,SVMnl,Decision,Random
Forward ChiSquare,0.304963,0.256858,0.430795,0.479167,0.595486


In [15]:
#5
result

Unnamed: 0,Linear,SVMl,SVMnl,Decision,Random
Forward ChiSquare,0.551985,0.545395,0.749654,0.782986,0.832899


In [18]:
#6
result

Unnamed: 0,Linear,SVMl,SVMnl,Decision,Random
Forward ChiSquare,0.599041,0.586446,0.838962,0.869792,0.887587


In [24]:
#7
result

Unnamed: 0,Linear,SVMl,SVMnl,Decision,Random
Forward ChiSquare,0.657035,0.641906,0.893007,0.826389,0.901042
