In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import RobustScaler

from sklearn.feature_selection import SelectFromModel, VarianceThreshold, SelectKBest, f_classif, RFE, f_regression

from sklearn.linear_model import LogisticRegression, LinearRegression

**DISPLAYS BAR PLOT OF VALUE COUNTS**

In [2]:
def counts_plot(x, y):
    """Plot a bar chart of the value counts of one column of the global ``df``.

    Every bar is annotated with its count, and the x tick labels are rotated
    by 90 degrees for better readability.

    Parameters
    ----------
    x : str
        Name of the column in ``df`` whose values are counted. It is also
        appended to the plot title.
    y
        Passed unchanged to ``sns.barplot`` as its ``palette`` argument.

    Notes
    -----
    Reads the module-level ``df``; it must be defined before this is called.
    """
    counts = df[x].value_counts()
    labels = counts.index
    heights = counts.values

    plt.figure(figsize=(12, 9))
    sns.barplot(x=labels, y=heights, palette=y)
    plt.title('Value Counts of column : ' + x)
    plt.xlabel('Values')
    plt.ylabel('Count')
    # rotate the tick labels so long category names do not overlap
    plt.xticks(rotation=90)

    # write each count just above its bar
    for position, height in enumerate(heights):
        plt.text(position, height, str(height),
                 ha='center', va='bottom', style='italic', fontsize=10)

    plt.tight_layout()
    plt.show()

**REMOVES THE DOLLAR SIGN FROM CURRENCY COLUMNS**

In [3]:
def dollar_remover(x):
    """Convert a currency cell such as ``"$1.50"`` to a float.

    Parameters
    ----------
    x : str or number
        Text with an optional ``$`` prefix, or an already-numeric value.
        Only the text after the last ``$`` is parsed.

    Returns
    -------
    float
        The parsed amount. Numeric input (including NaN) is returned as a
        float unchanged in value.

    Raises
    ------
    ValueError
        If the text after the last ``$`` is not a valid number.
    """
    # Already-numeric cells (e.g. NaN for a missing price) have nothing to strip;
    # calling .split on them would raise AttributeError.
    if isinstance(x, (int, float, np.number)):
        return float(x)

    amount = x.split("$")[-1]

    # Drop thousands separators so values like "$1,000.00" parse.
    return float(amount.replace(",", ""))

**REMOVES THE 'Hr' TEXT FROM TIME COLUMNS**

In [4]:
def hour_remover(x, fallback_hours=9.0):
    """Convert a time-limit label such as ``"2 Hr"`` to a number of hours.

    Parameters
    ----------
    x : str
        The label to convert.
    fallback_hours : float, optional
        Value returned for the labels ``"No Time Limit"``, ``"other"``,
        ``"26 Hrs"`` and ``"10 Hrs"``. Defaults to 9, the value this function
        has always used.
        NOTE(review): presumably these labels are treated as unlimited or
        outlier entries - confirm the intended replacement value.

    Returns
    -------
    float
        ``fallback_hours`` for the labels above, ``0.5`` for ``"30 min"``,
        otherwise the number that precedes ``"Hr"`` in the label.

    Raises
    ------
    ValueError
        If the text before ``"Hr"`` is not a valid number.
    """
    if x in ("No Time Limit", "other", "26 Hrs", "10 Hrs"):
        return float(fallback_hours)

    if x == "30 min":
        return 0.5

    # "2 Hr" / "2 Hrs" -> "2 " ; float() ignores the surrounding whitespace
    return float(x.split("Hr")[0])

In [5]:
def hour_remover2(x, fallback_hours=4.0):
    """Convert a time-limit label such as ``"2 Hr"`` to a number of hours.

    Same conversion as ``hour_remover`` but with a default replacement value
    of 4 instead of 9.

    Parameters
    ----------
    x : str
        The label to convert.
    fallback_hours : float, optional
        Value returned for the labels ``"No Time Limit"``, ``"other"``,
        ``"26 Hrs"`` and ``"10 Hrs"``. Defaults to 4, the value this function
        has always used.
        NOTE(review): presumably these labels are treated as unlimited or
        outlier entries - confirm the intended replacement value.

    Returns
    -------
    float
        ``fallback_hours`` for the labels above, ``0.5`` for ``"30 min"``,
        otherwise the number that precedes ``"Hr"`` in the label.

    Raises
    ------
    ValueError
        If the text before ``"Hr"`` is not a valid number.
    """
    if x in ("No Time Limit", "other", "26 Hrs", "10 Hrs"):
        return float(fallback_hours)

    if x == "30 min":
        return 0.5

    # "2 Hr" / "2 Hrs" -> "2 " ; float() ignores the surrounding whitespace
    return float(x.split("Hr")[0])

**METERHEAD GROUPER**

In [6]:
def meterhead_group(x):
    """Collapse a meter-head value into two groups.

    Returns ``x`` itself when it equals ``"Twin"``; every other value is
    mapped to ``"Not_Twin"``.
    """
    return x if x == "Twin" else "Not_Twin"

**CITY GROUPER**

In [7]:
def city_group(x):
    """Map a city/area name to one of three broader area labels.

    Returns ``"Lower_mainland"`` or ``"Downtown"`` when ``x`` is one of the
    names listed for that label, and ``"East_Vancouver"`` for anything else.
    """
    area_members = (
        ("Lower_mainland", ("Fairview", "Kitsilano", "West Point Grey", "Arbutus Ridge",
                            "Kerrisdale", "South Cambie", "Shaughnessy")),
        ("Downtown", ("Downtown", "West End")),
    )

    for area, members in area_members:
        if x in members:
            return area

    # every name not listed above falls into the catch-all group
    return "East_Vancouver"


# Groups each city value into one of three area labels: Lower_mainland, Downtown, or East_Vancouver (the default).

**DATA SPLITTER**

In [8]:
def data_split(df, x):
    """Return the feature columns of ``df``: every column except ``x``.

    Parameters
    ----------
    df : pandas.DataFrame
        The full data set. It is not modified.
    x : label or list of labels
        The target column(s) to leave out.

    Returns
    -------
    pandas.DataFrame
        A new frame without the column(s) ``x``.

    Raises
    ------
    KeyError
        If ``x`` is not a column of ``df``.
    """
    return df.drop(x, axis=1)

In [9]:
def data_split2(df, x):
    """Return the target column ``x`` of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        The full data set. It is not modified.
    x : label
        Name of the target column.

    Returns
    -------
    pandas.Series
        The column ``df[x]``.

    Raises
    ------
    KeyError
        If ``x`` is not a column of ``df``.
    """
    return df[x]

## FEATURE SELECTIONS

**VARIANCE THRESHOLD**

In [10]:
def select_variance(features, threshold_value):
    """Keep only the columns selected by a fitted ``VarianceThreshold``.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature columns; the selector is fitted on this frame.
    threshold_value : float
        Passed to ``VarianceThreshold`` as ``threshold``.

    Returns
    -------
    pandas.DataFrame
        ``features`` restricted to the columns flagged by ``get_support()``.
    """
    selector = VarianceThreshold(threshold=threshold_value)
    selector.fit(features)

    # boolean mask, one entry per column of `features`
    keep_mask = selector.get_support()

    return features.iloc[:, keep_mask]

**SELECT FROM MODEL**

In [11]:
def select_from_model(features, target):
    """Select features with ``SelectFromModel`` around a logistic regression.

    Parameters
    ----------
    features
        Feature matrix used both to fit the selector and to be reduced.
    target
        Target values used to fit the selector.

    Returns
    -------
    The result of the fitted selector's ``transform(features)``.
    """
    estimator = LogisticRegression(max_iter=5000)
    fitted_selector = SelectFromModel(estimator=estimator).fit(features, target)

    return fitted_selector.transform(features)

In [12]:
def select_from_model2(features, target):
    """Select features with ``SelectFromModel`` around a linear regression.

    Parameters
    ----------
    features
        Feature matrix used both to fit the selector and to be reduced.
    target
        Target values used to fit the selector.

    Returns
    -------
    The result of the fitted selector's ``transform(features)``.
    """
    estimator = LinearRegression()
    fitted_selector = SelectFromModel(estimator=estimator).fit(features, target)

    return fitted_selector.transform(features)

**RECURSIVE FEATURE ELIMINATION**

In [13]:
def recursive_fe(features, target, n):
    """Recursive feature elimination with a logistic-regression estimator.

    Parameters
    ----------
    features : pandas.DataFrame
        Candidate feature columns.
    target
        Target values used to fit the eliminator.
    n : int
        Passed to ``RFE`` as ``n_features_to_select``.

    Returns
    -------
    pandas.DataFrame
        ``features`` restricted to the columns flagged in ``support_``.
    """
    eliminator = RFE(
        estimator=LogisticRegression(max_iter=5000),
        n_features_to_select=n,
        step=1,
    )
    fitted = eliminator.fit(features, target)

    return features.loc[:, fitted.support_]

In [14]:
def recursive_fe2(features, target, n):
    """Recursive feature elimination with a linear-regression estimator.

    Parameters
    ----------
    features : pandas.DataFrame
        Candidate feature columns.
    target
        Target values used to fit the eliminator.
    n : int
        Passed to ``RFE`` as ``n_features_to_select``.

    Returns
    -------
    pandas.DataFrame
        ``features`` restricted to the columns flagged in ``support_``.
    """
    eliminator = RFE(
        estimator=LinearRegression(),
        n_features_to_select=n,
        step=1,
    )
    fitted = eliminator.fit(features, target)

    return features.loc[:, fitted.support_]

**SELECT K BEST WITH f_classif (CLASSIFICATION) AND f_regression (REGRESSION)**

In [15]:
def select_best(features, target, n, test_size=0.25, random_state=42):
    """Keep the ``n`` best columns according to ``SelectKBest`` with ``f_classif``.

    The selector is fitted only on the training part of a
    ``train_test_split`` of the data; the chosen columns are then taken from
    the complete ``features`` frame, so the result keeps every row.

    Parameters
    ----------
    features : pandas.DataFrame
        Candidate feature columns.
    target
        Class labels aligned with ``features``.
    n : int
        Passed to ``SelectKBest`` as ``k``.
    test_size : float, optional
        Passed to ``train_test_split``. Defaults to 0.25, the value this
        function has always used.
    random_state : int, optional
        Passed to ``train_test_split``. Defaults to 42, the value this
        function has always used.

    Returns
    -------
    pandas.DataFrame
        ``features`` restricted to the columns flagged by ``get_support()``.
    """
    # Only the training portion is needed to fit the selector.
    x_train, _, y_train, _ = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    selector = SelectKBest(score_func=f_classif, k=n)
    # fit() is enough: the transformed training data was never used.
    selector.fit(x_train, y_train)

    return features.iloc[:, selector.get_support()]

In [16]:
def select_best2(features, target, n, test_size=0.25, random_state=42):
    """Keep the ``n`` best columns according to ``SelectKBest`` with ``f_regression``.

    The selector is fitted only on the training part of a
    ``train_test_split`` of the data; the chosen columns are then taken from
    the complete ``features`` frame, so the result keeps every row.

    Parameters
    ----------
    features : pandas.DataFrame
        Candidate feature columns.
    target
        Target values aligned with ``features``.
    n : int
        Passed to ``SelectKBest`` as ``k``.
    test_size : float, optional
        Passed to ``train_test_split``. Defaults to 0.25, the value this
        function has always used.
    random_state : int, optional
        Passed to ``train_test_split``. Defaults to 42, the value this
        function has always used.

    Returns
    -------
    pandas.DataFrame
        ``features`` restricted to the columns flagged by ``get_support()``.
    """
    # Only the training portion is needed to fit the selector.
    x_train, _, y_train, _ = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    selector = SelectKBest(score_func=f_regression, k=n)
    # fit() is enough: the transformed training data was never used.
    selector.fit(x_train, y_train)

    return features.iloc[:, selector.get_support()]