## Feature Selection Functions 

In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

Correlation-Based Selection

In [2]:
def select_manual(data, threshold):
    """Keep the features whose absolute correlation with the target exceeds a threshold.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the candidate features together with the
        'log_inst_review' target column.
    threshold : float
        Exclusive lower bound on the absolute value of each column's
        correlation (as computed by ``DataFrame.corr()``) with
        'log_inst_review'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected feature columns, ordered by decreasing
        absolute correlation. The target column itself is never included.
        An empty-column frame is returned when nothing passes the threshold.
    """
    target_corr = data.corr()['log_inst_review'].abs().sort_values(ascending=False)
    selected = target_corr[target_corr > threshold].index

    # The target normally selects itself (self-correlation of 1), so it has to
    # be dropped again. errors='ignore' covers thresholds so high that not even
    # the target passes, where an unconditional drop would raise KeyError.
    return data.loc[:, selected].drop(columns='log_inst_review', errors='ignore')

Variance Threshold Selection

In [3]:
def select_variance(data, threshold_value):
    """Filter columns with a fitted ``VarianceThreshold`` selector.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are evaluated by the selector.
    threshold_value : float
        Value passed to ``VarianceThreshold`` as its ``threshold``.

    Returns
    -------
    pandas.DataFrame
        The columns of `data` that the fitted selector reports as supported.

    Notes
    -----
    Prints a per-column table with the computed variance and whether the
    column was kept.
    """
    selector = VarianceThreshold(threshold=threshold_value)
    selector.fit(data)
    keep_mask = selector.get_support()

    # Per-column report: variance seen by the selector and the keep/drop decision.
    report = pd.DataFrame(
        {'Variance': selector.variances_, 'Selection_status': keep_mask},
        index=data.columns,
    )
    print(report)

    return data.iloc[:, keep_mask]

Select K-Best Method

In [4]:
def select_best(data, num_of_features):
    """Select the top-scoring features with ``SelectKBest`` and ``f_regression``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the candidate features together with the
        'log_inst_review' target column.
    num_of_features : int or 'all'
        Passed to ``SelectKBest`` as ``k``.

    Returns
    -------
    pandas.DataFrame
        All rows of `data`, restricted to the feature columns chosen by the
        selector. The target column is not included.
    """
    target = data['log_inst_review']
    features = data.drop(columns='log_inst_review')

    # Scores are computed on the training portion only, so the held-out rows
    # do not influence which features get picked. The test halves of the split
    # are not needed here.
    x_train, _, y_train, _ = train_test_split(
        features, target, test_size=0.25, random_state=42
    )

    selector = SelectKBest(score_func=f_regression, k=num_of_features)
    # Only the fitted support mask is used, so fit() is enough; the original
    # fit_transform() built a transformed array that was thrown away.
    selector.fit(x_train, y_train)

    return features.iloc[:, selector.get_support()]

Polynomial Features

In [5]:
def make_poly(data):
    """Expand `data` with second-degree interaction terms.

    Uses ``PolynomialFeatures(degree=2, interaction_only=True,
    include_bias=False)``, i.e. the original features plus pairwise products
    of distinct features, without a constant bias column.

    Parameters
    ----------
    data : array-like
        Feature matrix handed to ``PolynomialFeatures.fit_transform``.

    Returns
    -------
    The transformed feature matrix exactly as returned by ``fit_transform``
    (column labels of a DataFrame input are not carried over by this function).
    """
    expander = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    return expander.fit_transform(data)

#transforms the data using the polynomial transformation method...

Scaling Function

In [6]:
def data_scaling(data, scaler=None):
    """Standardize `data`, optionally reusing an already fitted scaler.

    Parameters
    ----------
    data : array-like
        Feature matrix to scale.
    scaler : object with a ``transform`` method, optional
        An already fitted scaler. When given, `data` is only transformed with
        it (nothing is re-fitted), which is what held-out data needs so that
        it is scaled with the training statistics. When omitted, a fresh
        ``StandardScaler`` is fitted on `data` and applied to it.

    Returns
    -------
    The scaled data as returned by the scaler's ``fit_transform`` /
    ``transform``.
    """
    if scaler is None:
        return StandardScaler().fit_transform(data)
    return scaler.transform(data)

#scales the data
#Another known method is MinMaxScaler()

Regression Model 

In [7]:
def regression_call(features, target):
    """Fit a linear regression on a 75:25 split and report hold-out metrics.

    Parameters
    ----------
    features : array-like
        Feature matrix.
    target : array-like
        Target values aligned with `features`.

    Returns
    -------
    tuple
        ``(r_squared, rmse)``: the model's ``score`` on the test split and
        the root mean squared error of its test predictions.
    """
    # Split the dataset for training and testing (75:25), reproducibly.
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.25, random_state=42
    )

    # Fit the scaler on the training split only and reuse its statistics for
    # the test split. Fitting a second scaler on the test data would put the
    # two splits in different feature spaces and bias the evaluation.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    model = LinearRegression()
    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)
    r_squared = model.score(x_test, y_test)

    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
    # `squared` keyword is deprecated/removed in recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    return r_squared, rmse
   
#we run the regression model and return the R-Squared and Root Mean Squared error values...