In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.model_selection import cross_val_predict,cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import patsy
from IPython.display import HTML
import statsmodels

# Let pandas display up to 150 columns before truncating wide frames.
pd.options.display.max_columns = 150
# Width (in characters) of the underscore rules printed beneath section titles.
_chars_per_line = 117

# Many thanks to lucacerone and harshil for their contribution to StackOverflow.
# HTML/JavaScript snippet that hides or shows the notebook's `div.input` elements
# and renders a button to toggle them.
html_script = '''<script>code_show=true;function code_toggle(){if (code_show){
 $('div.input').hide();}else{$('div.input').show();}code_show = !code_show}
 $( document ).ready(code_toggle);</script><form action="javascript:code_toggle()">
 <input type="submit" value="Click here to toggle on/off the raw code."></form>'''

def code_toggle():
    '''Return an IPython HTML object built from `html_script`.

    Every occurrence of 'true' in the script is replaced by 'false' before it is
    wrapped, so the script's `code_show` flag starts out as false.
    '''
    toggled_markup = html_script.replace('true','false')
    return HTML(toggled_markup)

<center><img src="https://thetomatos.com/wp-content/uploads/2016/02/house-clipart-vector-graphics-house-eps-clip-art-vector.jpg" style=" margin: 15px; height: 120px"></center>


## Project 1:  Regression and classification with housing data

### Goals:

There are three goals this project seeks to achieve from a technical perspective:

    1. To create a predictive model for the sale price of a property based on its "fixed" and "known" characteristics.
    2. To create a predictive model for sale price of a property as a function of its "alterable" qualities.
    3. To determine the features in the housing data that best predict "abnormal" sales.

Furthermore, in service to the reader (and ourselves) we seek to:

    I.   Present the data, methods and assumptions we are working with in a clear and explicit manner.
    II.  Justify the decisions we make towards the development of our model, and its evaluation.


In [2]:
class GlobalHelper:
    '''Container whose body publishes small helper functions at module level.

    Each function is preceded by a ``global`` declaration, so its ``def`` binds the
    name in the module namespace (e.g. ``plist(...)``) instead of on the class.
    '''

    global immutable_replace
    def immutable_replace(entry, dictionary=None):
        '''Return ``dictionary[entry]`` if ``entry`` is a key, otherwise ``entry`` unchanged.

        With no dictionary supplied the entry is returned as is.
        '''
        if dictionary is not None and entry in dictionary.keys():
            return dictionary[entry]
        return entry

    global immutable_replace_with_unknowns
    def immutable_replace_with_unknowns(ser, dictionary, unknown = 'CouldNotBeInferred'):
        '''Placeholder that is not implemented yet; it does nothing and returns None.'''
        pass

    global scale_dataframe
    def scale_dataframe(df, method = 'StandardScaler'):
        '''Return a scaled copy of ``df`` as a DataFrame with the same columns and index.

        method: 'StandardScaler' or 'MinMaxScaler' (the scikit-learn scaler to use).
        Raises ValueError for any other method name.
        '''
        if method == 'StandardScaler':
            scaler = StandardScaler()
        elif method == 'MinMaxScaler':
            scaler = MinMaxScaler()
        else:
            raise ValueError(
                "Unknown scaling method {!r}; expected 'StandardScaler' or 'MinMaxScaler'".format(method))
        # Keep the original row labels so the scaled frame still aligns with the
        # target series and with the frame it was derived from.
        return pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

    global plist
    def plist(list_to_print = None, spacing = 14, col_num = 8):
        '''Print the items as left-justified cells, ``col_num`` cells per output line.

        Each item is converted with str() and padded to ``spacing`` characters.
        '''
        if list_to_print is None:
            list_to_print = []
        cells = [str(item).ljust(spacing) for item in list_to_print]
        for start in range(0, len(cells), col_num):
            print(''.join(cells[start:start + col_num]))

    global extended_type
    def extended_type(x):
        '''Return str(type(x)), or "<class 'missing'>" when ``x != x`` (as for NaN).'''
        if x != x:
            return "<class 'missing'>"
        else:
            return str(type(x))

    global get_m_n
    def get_m_n(N):
        '''Return (m, n) such that an m*n subplot grid has room for N plots.

        m is the ceiling of sqrt(N) and n is the smallest row/column count with
        m*n >= N, giving a near-square grid. (1, 1) is returned when N <= 0.
        '''
        if N <= 0:
            return (1, 1)
        m = int(np.ceil(np.sqrt(N)))
        n = -(-N // m)  # integer ceiling of N / m
        return (m, n)

In [3]:
class DataFrameHelper(pd.DataFrame):
    '''A pandas DataFrame extended with helpers for inspecting and plotting its columns.'''

    # Class-level list: one running log of helper calls shared by every instance
    # and subclass.
    history = []
    # Declared on the class so that assigning it on an instance is a plain
    # attribute assignment rather than an attempted column creation.
    checkpoint = None

    @property
    def _constructor(self):
        # Makes pandas operations (copy, slicing, boolean indexing, ...) return
        # this subclass instead of a plain DataFrame, so the helpers stay available.
        return type(self)

    def set_checkpoint(self):
        '''Store an independent copy of the current data to return to later.'''
        self.history.append('set_checkpoint')
        self.checkpoint = self.copy()

    def reset(self):
        '''Return a copy of the data saved by set_checkpoint().

        A method cannot rebind the caller's variable, so use ``df = df.reset()``.
        If no checkpoint exists a message is printed and None is returned.
        '''
        self.history.append('reset')
        if self.checkpoint is None:
            self.history.append('reset failed')
            print('reseting failed')
            return None
        restored = self.checkpoint.copy()
        # Carry the checkpoint along so the restored frame can be reset again.
        restored.checkpoint = self.checkpoint
        return restored

    def explore_types(self, columns = None):
        '''Return a DataFrame counting, per column, how often each value type occurs.

        Types are labelled by the module-level extended_type helper.
        '''
        if columns is None: columns = self.columns

        di = {key:self[key].apply(extended_type).value_counts() for key in columns}
        return pd.DataFrame(columns = columns, data = di)

    def pcolumns(self,columns = None, spacing = 14, col_num = 8):
        '''Prints the column names in a pretty way.
        The default is to print all columns, but a list may be passed.'''
        if columns is None: columns = self.columns
        plist(columns, spacing, col_num)

    def puniques(self,columns, spacing=7,col_num=16):
        '''Print, for each listed column, its name, a rule and its unique values.'''
        for col in columns:
            print(col+':\n{}'.format('_'*115))
            plist(self[col].unique(), spacing=spacing,col_num=col_num)
            print()

    def plot_value_occurences(self,normed = False, columns = None,
                              fig_size = (15,15), xmarks = False,mn = None):
        '''Draw one bar chart of value counts (missing values included) per column.

        columns: columns to plot (default: all).
        mn: optional (rows, cols) subplot grid; derived with get_m_n when omitted.
        xmarks: keep the x tick labels when True.
        normed: accepted but currently unused.
        '''
        if columns is None: columns = self.columns.tolist()

        num_of_cols = len(columns)
        print('Generating ',num_of_cols,' plots')
        if mn is None:
            m,n = get_m_n(num_of_cols)
        else:
            m,n = mn

        plt.figure(figsize=fig_size)
        plt.suptitle('Histogram of unique values',fontsize=25)

        for i in range(num_of_cols):
            col = columns[i]
            plt.subplot(m,n,i+1)
            plt.title(col)
            self[col].value_counts(dropna = False).plot(kind='bar')
            if xmarks == False: plt.xticks([])
        plt.tight_layout()
        plt.subplots_adjust(top=0.92)
        plt.show()
        return None

    def plot_poly_regplot(self,target,deg = 1, columns = None,
                              fig_size = (15,15), xmarks = False,mn = None,
                              low = False):
        '''Draw one seaborn regplot of each column against ``target``.

        deg: polynomial order of the fit; low: passed to regplot's ``lowess``.
        columns: columns to plot (default: the columns returned by describe()).
        mn: optional (rows, cols) subplot grid; derived with get_m_n when omitted.
        A column that cannot be plotted is reported and skipped.
        '''
        if columns is None: columns = self.describe().columns.tolist()

        num_of_cols = len(columns)
        print('Generating ',num_of_cols,' plots')
        if mn is None:
            m,n = get_m_n(num_of_cols)
        else:
            m,n = mn

        plt.figure(figsize=fig_size)
        plt.suptitle('Polynomial fit of unique values',fontsize=25)

        for i in range(num_of_cols):
            col = columns[i]
            try:
                plt.subplot(m,n,i+1)
                plt.title(col)
                # Data vectors are passed by keyword; positional data arguments
                # are not accepted by recent seaborn releases.
                sns.regplot(x=self[col], y=self[target], order = deg, lowess= low)
                if xmarks == False: plt.xticks([])
            except Exception as err:
                # Best effort: report the failing column and continue with the rest.
                print('Something went wrong while exploring {}: {}'.format(col, err))
        plt.tight_layout()
        plt.subplots_adjust(top=0.92)
        plt.show()
        return None

    def plot_types(self, columns = None, split = 4, figsize = (17,2)):
        '''Draw a stacked bar chart of the value-type counts from explore_types().

        split: accepted but currently unused.
        '''
        self.explore_types(columns = columns).T.plot(kind='bar',figsize=figsize, stacked=True, legend=None)
        plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,ncol=2, mode="expand", borderaxespad=0.)
        plt.suptitle('Occurence of types in attributes',fontsize=25)
        plt.subplots_adjust(top=0.80)
        plt.show()
        return None
        

In [4]:
class ProjectHelper(DataFrameHelper):
    '''This Class is hard coded for this project. Its methods are useful for data cleaning.'''

    # Hard-coded corrections as (row label, column, corrected value).
    _TYPO_FIXES = ((948, 'BsmtExposure', 'No'),
                   (332, 'BsmtFinType2', 'Unf'),
                   (1379, 'Electrical', 'SBrkr'))

    @staticmethod
    def _apply_typo_fixes(frame):
        '''Write the hard-coded corrections into ``frame`` and return it.'''
        for row, col, value in ProjectHelper._TYPO_FIXES:
            # Assigning through .loc to a missing label would append a new row,
            # so corrections are only applied to rows and columns that exist.
            if row in frame.index and col in frame.columns:
                frame.loc[row, col] = value
        return frame

    #definition of essentialy nessesary data cleaning
    def impute_typos(self):
        '''Return a copy of the frame with the hard-coded corrections applied.'''
        self.history.append('impute_typos')
        return self._apply_typo_fixes(self.copy())

    def impute_correct_type_to_MSSubClass(self):
        '''Replace the integer MSSubClass codes by their text descriptions, in place.

        Values that are not a known code are left unchanged.
        '''
        self.history.append('impute_correct_type_to_MSSubClass')

        MSSubClass_dict ={
            20:'1-STORY 1946 & NEWER ALL STYLES',
            30:'1-STORY 1945 & OLDER',
            40:'1-STORY W/FINISHED ATTIC ALL AGES',
            45:'1-1/2 STORY - UNFINISHED ALL AGES',
            50:'1-1/2 STORY FINISHED ALL AGES',
            60:'2-STORY 1946 & NEWER',
            70:'2-STORY 1945 & OLDER',
            75:'2-1/2 STORY ALL AGES',
            80:'SPLIT OR MULTI-LEVEL',
            85:'SPLIT FOYER',
            90:'DUPLEX - ALL STYLES AND AGES',
            120:'1-STORY PUD (Planned Unit Development) - 1946 & NEWER',
            150:'1-1/2 STORY PUD - ALL AGES',
            160:'2-STORY PUD - 1946 & NEWER',
            180:'PUD - MULTILEVEL - INCL SPLIT LEV/FOYER',
            190:'2 FAMILY CONVERSION - ALL STYLES AND AGES'}

        self['MSSubClass'] = self['MSSubClass'].apply(lambda x: immutable_replace(x,MSSubClass_dict))

    def impute_0_to_nan(self):
        '''We impute 0 to nan for every occurence in some columns (because 0 is a suitable value of an area)'''
        self.history.append('impute_0_to_nan')

        for col in ['LotFrontage','MasVnrArea']:
            # Assign the filled column back: an in-place fillna on a column
            # selection is not guaranteed to reach the parent frame.
            self[col] = self[col].fillna(value = 0)

    def impute_None_to_nan(self):
        '''We impute the string 'None' to nan (because no such feature is present)'''

        self.history.append('impute_None_to_nan')
        for col in ['Alley','MasVnrType','BsmtFinType2','GarageType','PoolQC','Fence','MiscFeature']:
            self[col] = self[col].fillna(value = 'None')

    def impute_NotApplicable_to_nan(self):
        '''We impute 'NotApplicable' to nan (because this is an atribute of a feature that is not present)'''
        self.history.append('impute_NotApplicable_to_nan')

        for col in ['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','FireplaceQu','GarageYrBlt',
                    'GarageFinish','GarageQual','GarageCond']:
            self[col] = self[col].fillna(value = 'NotApplicable')

    # Perform the above imputation methods one after another.
    def impute(self):
        '''Apply every imputation step to this frame, in place.'''
        self.history.append('impute...')
        # impute_typos() returns a corrected copy; here the corrections must
        # land on this frame, so they are written directly.
        self.history.append('impute_typos')
        self._apply_typo_fixes(self)
        self.impute_correct_type_to_MSSubClass()
        self.impute_0_to_nan()
        self.impute_None_to_nan()
        self.impute_NotApplicable_to_nan()

    #definition of 'optional' data cleaning
    def cast_quality_to_numbers(self,NA = 0):
        '''Map the quality grades Ex/Gd/TA/Fa/Po to 5..1 in the quality columns, in place.

        'NA' and 'NotApplicable' are mapped to ``NA``; other values are left unchanged.
        '''
        self.history.append('cast_quality_to_numbers')
        quality_to_number_dict = {
           'Ex':5,
           'Gd':4,
           'TA':3,
           'Fa':2,
           'Po':1,
           'NA':NA,
           'NotApplicable': NA}

        for col in ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
                    'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']:
            self[col] = self[col].apply(lambda x: immutable_replace(x,quality_to_number_dict))

    def cast_land_slope_to_numbers(self):
        '''Map LandSlope Gtl/Mod/Sev to 0/1/2, in place; other values are left unchanged.'''
        self.history.append('cast_land_slope_to_numbers')
        LandSlope_dict = {
           'Gtl':0, #Gentle slope
           'Mod':1, #Moderate Slope
           'Sev':2} #Severe Slope
        self['LandSlope'] = self['LandSlope'].apply(lambda x: immutable_replace(x,LandSlope_dict))

    #definition of methods for quality of life improvements.
    def get_years(self,LIST = None):
        '''Return the rows whose YrSold is in ``LIST`` (default [2010]).

        A value that is not a list is treated as a single year.
        '''
        if LIST is None: LIST = [2010]
        if not isinstance(LIST,list): LIST = [LIST]
        return self[self['YrSold'].isin(LIST)]
    

In [5]:
class ModelingReport:
    '''Record of one modeling run: the model, its data, its scores and optional settings.'''

    def __init__(self,model,X,y,naive_score,cv_score,true_score):
        '''Store the model, the feature matrix X, the target y and the three scores.'''
        self.model = model
        self.X = X
        self.y = y
        self.naive_score = naive_score
        self.cv_score = cv_score
        self.true_score = true_score
        # Filled in by remember_config(); None means no settings were recorded.
        self.stored_settings = None

    def to_df(self):
        '''Return a one-row DataFrame (index 'rec:') describing this report.

        Columns are report, model, naive_score, true_score and cv_score, followed
        by one column per stored setting when remember_config() has been called.
        '''
        rec = pd.DataFrame(columns=['report', 'model','naive_score','true_score','cv_score'],
                           data={'report':self,'model':self.model,'naive_score':self.naive_score,
                                 'cv_score':self.cv_score,'true_score':self.true_score},index = ['rec:'])

        # Identity check: `!= None` would be evaluated element-wise (and fail)
        # if the stored settings were array-like.
        if self.stored_settings is not None:
            settings = pd.DataFrame(self.stored_settings,index=['rec:'])
            rec = pd.concat([rec,settings],axis=1)
        return rec

    def remember_config(self,di):
        '''Store the settings ``di``; to_df() appends them as extra columns.'''
        self.stored_settings = di

In [6]:
def implement_modeling(
    df,
    target = 'SalePrice',
    ignore_colums = None,
    consider_columns = None,
    
    model='lm',
    model_alpha = 1,
    
    get_dummies = True, 
    dummy_columns = None,
    do_drop_first = False,
    

    scale_technique = None):
    
    '''Prepare the feature matrix and target for a regression model.

    df: source frame; ``target`` names the column to predict.
    consider_columns: if given, only these feature columns are kept.
    ignore_colums: if given, these feature columns are dropped when present.
    model, model_alpha: 'lm', 'ridge' or 'lasso' selects the estimator that is
        instantiated; it is not fitted here (see the note at the end of the body).
    get_dummies, dummy_columns, do_drop_first: dummy-encode ``dummy_columns``
        (default: every non-numeric column). With get_dummies False and no
        dummy_columns given, the non-numeric columns are dropped instead.
    scale_technique: None, or a scale_dataframe method name
        ('StandardScaler' / 'MinMaxScaler'; the short forms 'S' / 'M' are accepted).

    Returns the tuple (X, y).'''
    
    X = df.drop([target],axis=1)
    y = df[target]
    
    # using specific colums or ignoring others
    if consider_columns is not None:
        X = X[consider_columns]
    if ignore_colums is not None:
        X = X.drop([col for col in ignore_colums if col in X.columns],axis = 1)
    
    # Select a model and initialize it with the appropiate hyperparameters
    if model == 'lm':
        model = LinearRegression()
    elif model == 'ridge':
        model = Ridge(alpha = model_alpha)
    elif model == 'lasso':
        model = Lasso(alpha = model_alpha)

    # Get dummie variables
    # This will dummify columns if you specify them, or dummify all non numeric columns if you do not
    encode = get_dummies == True or dummy_columns is not None
    if dummy_columns is None:
        dummy_columns = X.select_dtypes(exclude='number').columns.tolist()

    if encode:
        X = pd.get_dummies(data = X,columns = dummy_columns, drop_first=do_drop_first)
        X.columns = [col.replace(' ','_') for col in X.columns]
    else:
        X = X.drop(dummy_columns,axis=1)

    # Perform scaling
    if scale_technique is not None:
        method = {'S':'StandardScaler','M':'MinMaxScaler'}.get(scale_technique, scale_technique)
        row_labels = X.index
        X = scale_dataframe(X,method)
        # Keep X aligned with y whatever index the scaler helper returns.
        X.index = row_labels
        
    # NOTE: fitting and scoring (naive score, cross-validated score, score on a
    # held-out 2010 split) and the construction of a ModelingReport are disabled;
    # the instantiated model is therefore not used or returned yet.

    return (X,y)

In [7]:
def getXy(
    df,
    target = 'SalePrice',
    ignore_colums = None,
    consider_columns = None,
    
    drop_col_if_contains_nan = True,
    
    cast_into_other = None,
    
    get_dummies = True, 
    dummy_columns = None,
    do_drop_first = False,
    
    scale_technique = None):
    
    '''Prepare a feature matrix X and a target series y from ``df``.

    target: name of the column to predict.
    consider_columns: if given, only these feature columns are kept.
    ignore_colums: if given, these feature columns are dropped when present.
    drop_col_if_contains_nan: True drops every feature column containing a missing
        value; False drops the affected rows instead (from both X and y).
    cast_into_other: iterable of (column, mapping) pairs; values found in the
        mapping are replaced, all others are kept.
    get_dummies, dummy_columns, do_drop_first: dummy-encode ``dummy_columns``
        (default: every non-numeric column). With get_dummies False and no
        dummy_columns given, the non-numeric columns are dropped instead.
    scale_technique: None, or a scale_dataframe method name
        ('StandardScaler' / 'MinMaxScaler'; the short forms 'S' / 'M' are accepted).

    Returns the tuple (X, y).'''
    
    X = df.drop([target],axis=1)
    y = df[target]
    
    # using specific colums or ignoring others
    if consider_columns is not None:
        X = X[consider_columns]
    if ignore_colums is not None:
        X = X.drop([col for col in ignore_colums if col in X.columns],axis = 1)

    # drop columns or rows when a a missing value is found.
    if drop_col_if_contains_nan:
        X = X.dropna(how = 'any',axis = 1)
    else:
        # Drop the same rows from the target so X and y stay aligned.
        complete_rows = X.notna().all(axis = 1)
        X = X[complete_rows]
        y = y[complete_rows]
        
    # Perform a map from old values to new values on specific columns.
    for col,di in (cast_into_other or []):
        X[col] = X[col].apply(lambda x: immutable_replace(x,di))

    # Get dummie variables: This will dummify columns if you specify them, 
    # or dummify all non numeric columns if you do not specify any.
    encode = get_dummies == True or dummy_columns is not None
    if dummy_columns is None:
        dummy_columns = X.select_dtypes(exclude='number').columns.tolist()

    if encode:
        X = pd.get_dummies(data = X,columns = dummy_columns, drop_first=do_drop_first)
        X.columns = [col.replace(' ','_') for col in X.columns]
    else:
        X = X.drop(dummy_columns,axis=1)

    # Perform scaling
    if scale_technique is not None:
        method = {'S':'StandardScaler','M':'MinMaxScaler'}.get(scale_technique, scale_technique)
        row_labels = X.index
        X = scale_dataframe(X,method)
        # Keep X aligned with y whatever index the scaler helper returns.
        X.index = row_labels

    return (X,y)

<center><img src="data-science-workflow.jpg" style=" margin: 15px; height: 120px"></center>

### 1. Parsing the data
#### 1.1 Introducing the data

In this project we will be working with the Ames housing data set that consists of records detailing 1460 sales of various properties from 2006 to 2010 in the city of Ames, Iowa. Of these, 1450 records pertain to the sale of residential properties.

<a href="https://git.generalassemb.ly/raw/DSI-LDN-3/lessons-repo/master/projects/project-02/data_description.txt?token=AAAY4q6W44WqbDLYg_WJiPApoQi-l2qrks5ZeROlwA%3D%3D" target="_blank">Feature Descriptions</a>

In [None]:
# Read the raw sales records and discard the 'Id' column.
data_set = pd.read_csv('./housing.csv').drop(columns=['Id'])

# Boolean mask selecting the rows whose MSZoning is one of the four listed codes.
residential = data_set['MSZoning'].isin(['FV','RH','RL','RM'])

# Working frame: only the selected rows, wrapped in the exploration helper.
df = DataFrameHelper(data_set.loc[residential])

The 80 features associated with a sale are:

In [None]:
# Print every column name of df in a fixed-width grid.
df.pcolumns()

Soon we will classify these properties as either:

    1. Fixed         (it would not be feasible to alter these attributes)
    2. Alterable     (it would be possible to alter these attributes)
    3. Incidental    (it is a property of the specific sale)
    4. Derivative    (it can be deduced to a high degree of accuracy from other properties)
    
But first we inspect our data for missing values and deal with them in a case-by-case basis.

#### 1.2. Verifying the quality of the data

##### 1.2.1 Dealing with missing values

We seek to identify and replace our missing values.

In [None]:
# Stacked bar chart of the value types (missing included) counted in each column.
df.plot_types()

Our first concern is to handle the attributes with missing values. 

These are:

In [None]:
# Names of the columns that contain at least one missing value, in column order.
examine_closer = df.columns[df.isnull().any()].tolist()
plist(examine_closer)

First, we concentrate on attributes with missing values where some of the present entries are strings.

We look at their unique values and the frequency of their incidence.

In [None]:
# Split the columns with missing values into those holding at least one string
# entry and the remaining ones.
examine_closer_str = [col for col in examine_closer if df[col].apply(lambda x: isinstance(x,str)).any()]
examine_closer_num = [col for col in examine_closer if col not in examine_closer_str]
# Bar charts of the value counts (missing included) for the string-valued group.
df.plot_value_occurences(columns = examine_closer_str,xmarks = True)

We notice many of these missing values correspond to the 'NA' value in the .csv file used to indicate that a feature is not present. We replace these with the string 'DescribesMissing'. We do this for 2 reasons:

    1. For many of these features we will end up using dummy variables, which do not 
       handle np.nan well, but can easily incorporate one more category. 
    
    2. By enforcing consistency we hope we can later inspect these values further, 
       perhaps as part of some feature engineering.
    
The features for which we replace nan with 'DescribesMissing' are: 

    BsmtQual      BsmtCond      BsmtExposure  BsmtFinType1  BsmtFinType2      FireplaceQu   
    GarageType    GarageFinish  GarageQual    GarageCond    PoolQC        

However, for:

    Alley         Fence         MiscFeature

we replace missing values with the string 'None', following the convention used in describing the type of facade (MasVnrType). In describing the type of covering of the walls, the data set allows the absence of any covering to be a valid covering kind in and of itself.



Regarding MasVnrType itself:

we could replace nan with 'None', but this would amount to assuming data input errors. Rather than moving forward with this assumption, we explore another possibility:

In [None]:
# Rows with a missing MasVnrType, shown next to their sale condition and dates.
df[df['MasVnrType'].isnull()][['MasVnrType','SaleCondition','YearRemodAdd','YrSold']]

As we can see, 3 of the 8 properties with nan MasVnrType were only partially built. It would be wrong to assume that any one kind of veneer will be used (or that the property will intentionally be left with no veneer). Our plan is then to replace nan with "CouldNotBeInferred", with the understanding that at the time of creating dummy variables, records with "CouldNotBeInferred" will be given values between 0 and 1 in the pertinent dummy variables, such that they add up to 1 and match the ratio of houses with each property. By doing this the model can benefit from the information in other features of the record without us imposing assumptions. 

We could apply the same reasoning to Electrical, where the missing value corresponds to a property we believe is undergoing remodelling. 

In [None]:
# Rows with a missing Electrical value, shown next to related attributes.
df[df['Electrical'].isnull()][['Electrical','CentralAir','SaleCondition','YearBuilt','YearRemodAdd','YrSold']]

However we choose to side with a 'nearest-neighbors-style' heuristic, and impute a value of "SBrkr" to this nan value.

As a justification note that property 1379 was built in 2006, and 100% of properties built between 1965 and 2010 have used "SBrkr" as their Electrical system.

In [None]:
# Electrical value counts (missing included) for properties whose YearBuilt is
# less than 42 years away from 2007.
df[np.abs(df['YearBuilt'] - 2007) < 42]['Electrical'].value_counts(dropna= False)

In [None]:
descriptors = ['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','FireplaceQu',
               'GarageType','GarageFinish','GarageQual','GarageCond','PoolQC']

# One replacement value per column; the column groups do not overlap, so a single
# fillna call applies all of them.
fill_values = {col:'DescribesMissing' for col in descriptors}
fill_values.update({'Alley':'None','Fence':'None','MiscFeature':'None'})
fill_values.update({'MasVnrType':'CouldNotBeInferred'})
fill_values.update({'Electrical':'SBrkr'})

df.fillna(fill_values,inplace = True)

_ = None

It remains for us to deal with the missing entries in columns that are mostly numeric, these are:

In [None]:
# Columns with missing values that contain no string entries.
plist([c for c in examine_closer if c not in examine_closer_str])

We interpret a missing value in LotFrontage and MasVnrArea to represent a length and area of 0 respectively. However, no such easy way of dealing with GarageYrBlt is available. One option would be to drop the 37 records for which there is a missing value in GarageYrBlt, but we don't wish to omit the properties without a garage from our model since we believe the presence or absence of a garage may be very significant. 

Moreover we have plenty of information regarding the garage of the properties that have them. And a garage area of 0 is a surefire way to deduce the absence of a garage, and hence of a GarageYrBlt value.

We will soon consider dropping the feature from our analysis, but for the time being, we replace nan with "DescribesMissing".

In [None]:
# Missing lengths/areas become 0; a missing garage year becomes a marker string.
df.fillna({'LotFrontage':0,'MasVnrArea':0,'GarageYrBlt':'DescribesMissing'},inplace=True)

# Every GarageYrBlt entry is converted to its string form.
df['GarageYrBlt'] = df['GarageYrBlt'].apply(str)

_ = None

Lastly, we note that the MSSubClass attribute is encoded as integers. However, it is clearly meant to be a category. We replace the integer codes with their corresponding category expressed as a string. This gives us a DataFrame with no missing values moving forward.

In [None]:
# Integer MSSubClass code -> text description of the dwelling type.
MSSubClass_dict = {
    20: '1-STORY 1946 & NEWER ALL STYLES',
    30: '1-STORY 1945 & OLDER',
    40: '1-STORY W/FINISHED ATTIC ALL AGES',
    45: '1-1/2 STORY - UNFINISHED ALL AGES',
    50: '1-1/2 STORY FINISHED ALL AGES',
    60: '2-STORY 1946 & NEWER',
    70: '2-STORY 1945 & OLDER',
    75: '2-1/2 STORY ALL AGES',
    80: 'SPLIT OR MULTI-LEVEL',
    85: 'SPLIT FOYER',
    90: 'DUPLEX - ALL STYLES AND AGES',
    120: '1-STORY PUD (Planned Unit Development) - 1946 & NEWER',
    150: '1-1/2 STORY PUD - ALL AGES',
    160: '2-STORY PUD - 1946 & NEWER',
    180: 'PUD - MULTILEVEL - INCL SPLIT LEV/FOYER',
    190: '2 FAMILY CONVERSION - ALL STYLES AND AGES',
}

# Codes found in the dictionary are replaced; any other value is kept as is.
df['MSSubClass'] = df['MSSubClass'].map(lambda code: immutable_replace(code, MSSubClass_dict))

### Section 2: Feature selection.

Let us take a quick look at our current feature matrix, and the individual power of attributes in predicting our sale price.

In [None]:
# Feature matrix and target built with getXy's defaults.
X,y = getXy(df)

# Features and target side by side again, wrapped in the exploration helper.
df_fp = DataFrameHelper(pd.concat([X,y],axis = 1))



In [None]:
# Indicator matrix: 1 where the pairwise correlation exceeds 0.5, otherwise 0
# (a NaN correlation compares False and therefore maps to 0). The vectorised
# comparison replaces DataFrame.applymap, which newer pandas releases deprecate.
kkl = (df_fp.corr() > 0.5).astype(int)



In [None]:
# Keep the columns whose indicator column sums to exactly 3, i.e. that have exactly
# three entries above the 0.5 threshold (presumably including the column's own
# diagonal entry — confirm), and plot the indicator sub-matrix.
tf = [col for col in kkl.columns if kkl[col].sum() == 3]
kkn = kkl[tf].loc[tf,:]
sns.heatmap(kkn)
plt.show()

In [None]:
# Degree-1 least-squares fit of SalePrice on MoSold; full=True also returns the
# fit diagnostics. The leftover `np.polyfit?` help lookup was removed so that the
# fit result is the cell's displayed output.
np.polyfit(df['MoSold'],df['SalePrice'],1,full=True)

#### 2.1 Identifying superfluous features.

Before we begin developing a model for sale price, we take a moment to explore the 'redundancy' in our data. As we shall see, several attributes of a sale are merely linear combinations of other attributes. We can identify these situations for numeric features by setting $x_i$ as the target of a plain linear regression using features 

$$x_1,...x_{i-1},-\square-,x_{i+1},...,x_{n}$$

to predict the values of $x_i$. We should then look out for an $R^2$ value of 1.

In [None]:
# Columns summarised by describe().
numerics = df.describe().columns.tolist()

def naive_lm_predict(attributes, ignore=None, verbose=0):
    '''Fit a plain linear regression for each attribute, using it as the target.

    For every name in ``attributes`` the global ``df`` is turned into (X, y) with
    getXy (target = that attribute, ``ignore`` passed as ignore_colums), a
    LinearRegression is fitted and scored on the same data.

    verbose: > 0 prints the score table, > 1 also prints progress headers.
    Returns (results, coef_storage): the formatted "name:score" strings and a dict
    mapping each attribute to a DataFrame of feature names and coefficients.
    '''
    def banner(template):
        # Section title followed by a rule of underscores.
        print(template.format('_' * _chars_per_line))

    chatty = verbose > 1
    coef_storage, results = {}, []

    if chatty:
        banner('Predicting:\n{}')

    for col in attributes:
        if chatty:
            print(col, end='; ')
        X, y = getXy(df, target=col, ignore_colums=ignore)
        lm = LinearRegression().fit(X, y)
        score = lm.score(X, y)
        coef_storage[col] = pd.DataFrame({'atr': X.columns, 'coef': lm.coef_})
        results.append(':'.join([col.ljust(14), str(round(score, 4))]))

    if chatty:
        banner('\n\nResults:\n{}')
    if verbose > 0:
        plist(results, col_num=4, spacing=24)
    return (results, coef_storage)

#_ = naive_lm_predict(numerics[1:3])

Reflecting on the results above guides much of what follows. So let us transcribe them for ease of reference:

    LotFrontage   :0.4574   LotArea       :0.7219   OverallQual   :0.8318   OverallCond   :0.6436   
    YearBuilt     :0.9443   YearRemodAdd  :0.7318   MasVnrArea    :0.6882   BsmtFinSF1    :1.0      
    BsmtFinSF2    :1.0      BsmtUnfSF     :1.0      TotalBsmtSF   :1.0      1stFlrSF      :1.0      
    2ndFlrSF      :1.0      LowQualFinSF  :1.0      GrLivArea     :1.0      BsmtFullBath  :0.6737   
    BsmtHalfBath  :0.333    FullBath      :0.7651   HalfBath      :0.6898   BedroomAbvGr  :0.7376   
    KitchenAbvGr  :0.8127   TotRmsAbvGrd  :0.859    Fireplaces    :0.8728   GarageCars    :0.8807   
    GarageArea    :0.8714   WoodDeckSF    :0.3592   OpenPorchSF   :0.4338   EnclosedPorch :0.4121   
    3SsnPorch     :0.188    ScreenPorch   :0.2904   PoolArea      :0.9959   MiscVal       :0.966    
    MoSold        :0.2123   YrSold        :0.2577   SalePrice     :0.9335   




And incidentally we do indeed confirm that, for each of the 1450 properties:

$$\text{TotalBsmtSF = BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF}$$

$$\text{GrLivArea = 1stFlrSF + 2ndFlrSF + LowQualFinSF}$$

In [None]:
def naive_logreg_predict(attributes, myC = 10**10, ignore= None):
    '''Fit a logistic regression for each attribute, using it as the target.

    For every name in ``attributes`` the global ``df`` is turned into (X, y) with
    getXy (target = that attribute, ``ignore`` passed as ignore_colums), a
    LogisticRegression with C = ``myC`` is fitted and its accuracy on the same
    data is recorded. Progress and the score table are printed.

    Returns (results, coef_storage): the formatted "name:score" strings and a dict
    mapping each attribute to a DataFrame of coefficients (rows: features,
    columns: class labels).
    '''
    coef_storage = {}
    results = []
    print('Predicting:\n{}'.format('_'*_chars_per_line))
    for col in attributes:
        print(col, end='; ')
        X,y = getXy(df,target=col, ignore_colums= ignore)
        logreg = LogisticRegression(C = myC)
        logreg.fit(X,y)
        score = logreg.score(X,y) 
        # coef_ holds one row per class, except for two-class targets where a
        # single row is stored; that row is labelled with the last class.
        if logreg.coef_.shape[0] == len(logreg.classes_):
            class_labels = logreg.classes_
        else:
            class_labels = logreg.classes_[-1:]
        coef_storage[col] = pd.DataFrame(logreg.coef_.T, index=X.columns, columns=class_labels)
        results.append(col.ljust(14)+':'+str(round(score,4)))
    print('\n\nResults:\n{}'.format('_'*_chars_per_line))
    plist(results,col_num=4,spacing=24)
    return (results, coef_storage)

# Every column that is not among the numeric ones.
categoric = [col for col in df.columns if col not in numerics]

#_ = naive_logreg_predict(categoric)

The corresponding output is:

    Predicting:
    __________________________________________________________________________________________________________
    MSSubClass; MSZoning; Street; Alley; LotShape; LandContour; Utilities; LotConfig; LandSlope; Neighborhood;
    Condition1; Condition2; BldgType; HouseStyle; RoofStyle; RoofMatl; Exterior1st; Exterior2nd; MasVnrType;
    ExterQual; ExterCond; Foundation; BsmtQual; BsmtCond; BsmtExposure; BsmtFinType1; BsmtFinType2; Heating;
    HeatingQC; CentralAir; Electrical; KitchenQual; Functional; FireplaceQu; GarageType; GarageYrBlt; GarageFinish;
    GarageQual; GarageCond; PavedDrive; PoolQC; Fence; MiscFeature; SaleType; SaleCondition; 

    Results:
    _____________________________________________________________________________________________
    MSSubClass    :0.8159   MSZoning      :0.8759   Street        :1.0      Alley         :0.9393   
    LotShape      :0.7131   LandContour   :0.9062   Utilities     :1.0      LotConfig     :0.7234   
    LandSlope     :0.9531   Neighborhood  :0.4966   Condition1    :0.8621   Condition2    :0.9938   
    BldgType      :0.9034   HouseStyle    :0.8628   RoofStyle     :0.8076   RoofMatl      :0.9848   
    Exterior1st   :0.5124   Exterior2nd   :0.4834   MasVnrType    :0.8793   ExterQual     :0.8807   
    ExterCond     :0.8821   Foundation    :0.8028   BsmtQual      :0.8269   BsmtCond      :0.9241   
    BsmtExposure  :0.7021   BsmtFinType1  :0.6731   BsmtFinType2  :0.9379   Heating       :0.9834   
    HeatingQC     :0.6248   CentralAir    :0.9538   Electrical    :0.9152   KitchenQual   :0.7855   
    Functional    :0.9317   FireplaceQu   :0.6207   GarageType    :0.7972   GarageYrBlt   :0.3021   
    GarageFinish  :0.6634   GarageQual    :0.9559   GarageCond    :0.9641   PavedDrive    :0.9276   
    PoolQC        :1.0      Fence         :0.8097   MiscFeature   :0.9972   SaleType      :0.891    
    SaleCondition :0.8421   


In [None]:
# For every column of the current X: the coefficient tables obtained when
# SalePrice is regressed with that column name passed in the ignore list.
ress = {col: naive_lm_predict(['SalePrice'], ignore = [col])[1] for col in X.columns}

ress

In [None]:
# NOTE(review): `decide_to_ignore` and the module-level `lm` are not assigned in any
# earlier visible cell — confirm they exist when run on a fresh kernel.
# Regress TotalBsmtSF on the prepared features and tabulate the coefficients.
X,y = implement_modeling(df, target='TotalBsmtSF',ignore_colums=decide_to_ignore)

lm.fit(X,y)

pd.DataFrame({'atr':X.columns,'coef':lm.coef_})

In [None]:
# NOTE(review): `incidental` is only assigned in a later cell, so this cell fails
# when the notebook is run top to bottom — confirm the intended cell order.
# Columns that describe() does not summarise and that are not in `incidental`,
# plotted on an explicit 7 x 6 grid.
categorical = [col for col in df.columns if col not in df.describe().columns and col not in incidental]
df.plot_value_occurences(columns=categorical, mn = (7,6))

In [None]:
# Three candidate alphas, logarithmically spaced from 1e-4 to 1e4.
paramgrid = {'alpha':np.logspace(-4,4,3)}

# 2-fold grid search over alpha for a Ridge regressor.
ridge = Ridge()
gs = GridSearchCV(ridge, 
                  param_grid=paramgrid,
                  cv = 2, 
                  n_jobs=3, 
                  verbose=4)

# NOTE(review): `decide_to_ignore` is not assigned in any earlier visible cell.
A,B = implement_modeling(df, ignore_colums=decide_to_ignore)

gs.fit(A,B)



In [None]:
# Fit the `ridge` instance created above (constructed with default parameters) on A, B.
ridge.fit(A,B)

Further inspection reveals that all but one of the properties have access to all public utilities.

In [None]:
# Draft classification of the attributes into fixed / alterable / derivative /
# incidental groups.
kfixed = ['MSSubClass','MSZoning','LotFrontage','LotArea','Street','Alley','LotShape','LandContour','Utilities',
          'LandSlope','LotConfig','Neighborhood','Condition1','Condition2','YearBuilt','YearRemodAdd','RoofStyle',
          'MasVnrType','MasVnrArea','Foundation','BsmtQual','BsmtExposure','BsmtFinSF1','BsmtFinSF2','Heating',
          '2ndFlrSF','KitchenAbvGr','Fireplaces','GarageType','GarageCars','GarageArea']

# 'BedroomAbvGr' is the column name used throughout the notebook; the previous
# entry 'Bedroom' did not match any column.
kalterable = ['RoofMatl','Exterior1st','Exterior2nd','ExterQual','ExterCond','BsmtCond','BsmtFinType1',
              'BsmtFinType2','BsmtUnfSF','HeatingQC','CentralAir','Electrical','1stFlrSF','GrLivArea',
              'BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenQual','TotRmsAbvGrd',
              'Functional','FireplaceQu','GarageFinish','GarageQual','GarageCond','PavedDrive','WoodDeckSF',
              'OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','PoolQC','Fence',
              'MiscFeature','MiscVal']

    
kderivative = ['BldgType','HouseStyle','OverallQual','OverallCond','LowQualFinSF','TotalBsmtSF']

kincidental = ['MoSold','YrSold','SaleType','SaleCondition','SalePrice']

# Total number of attributes classified so far.
len(kalterable + kfixed +kderivative +kincidental)

In [None]:

# NOTE(review): `incidental` and `fixed` are assigned in the next cell, so this
# comparison fails when the notebook is run top to bottom. Only the last
# expression of a cell is displayed, so the first comparison produces no output.
kincidental == incidental

[f for f in kfixed if f not in fixed]

In [None]:
# Attributes treated as fixed characteristics of a property.
fixed = ['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape',
         'LandContour',  'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
         'Condition2', 'BldgType', 'HouseStyle', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
         'Foundation', 'BsmtExposure', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
         'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
         'TotRmsAbvGrd', 'Fireplaces', 'GarageType', 'GarageCars', 'GarageArea']

# Attributes of the sale itself.
incidental = ['MoSold','YrSold','SaleType','SaleCondition','SalePrice']

# Everything else, in column order.
excluded = set(fixed) | set(incidental)
alterable = [col for col in df.columns if col not in excluded]

# Print each group under its title and an underscore rule.
for heading, names in (('Fixed', fixed), ('Alterable', alterable), ('Incidental', incidental)):
    print('\n{}:\n{}'.format(heading, '_'*_chars_per_line))
    plist(names)

However, we do not consider it possible to alter the value of an attribute when it relates to an aspect of the house that is not present. As such, we frown on the idea of improving the PoolQC in a house where PoolArea is 0 and the like.

At this time we have not implemented a check in our code that prevents a user from speculating, and the linear model we implement will not distinguish between valid and nonsensical speculation. User caution is advised.

This very much pushes us to examine attributes with missing values before moving forwards.

When possible we replace NaN with 0, such as when describing a length or an area (e.g. LotFrontage or MasVnrArea).

We establish a convention: 

    1. We use 'NotPresent' to indicate the physical absence of a feature.
        eg. The Alley attribute that states the type of alley leading up to the house will have:
           Grvl          When it is a gravel road
           Pave          When it is a paved road
           NotPresent    When there is no alley leading up to the house
        
    2. We use 'NotApplicable' to indicate a metaphysical absence, such as the quality of a missing feature.
        eg. The quality of a pool when the house has no pool


In [None]:
# Value-count bar charts for the first 17 columns of df.
df.plot_value_occurences(columns=df.columns.tolist()[0:17])

In [None]:
# NOTE(review): `df_data_cleaning` is not assigned in any visible cell — confirm
# where it is created.
df_data_cleaning.set_checkpoint()

In [None]:
# Print the unique values of every column that has at least one string entry.
# NOTE(review): `types_df` is not assigned in any visible cell (presumably the
# result of df.explore_types() — confirm), and plist as defined in this notebook
# has no `rstart` parameter, so this call raises a TypeError.
for col in [col for col in df.columns if types_df[col]["<class 'str'>"] > 0]:
    print(col+':')
    plist(df[col].unique(), spacing=10,rstart = '> ')
    print()

In [None]:

    
class DataExplorer(DataFrameHelper):
    """DataFrameHelper extended with exploratory plotting helpers."""

    def explore_correlation(self, cols=None, y=None, order=2):
        """Draw a seaborn regression plot of ``self[cols]`` against ``y``.

        Parameters
        ----------
        cols : column selector used to index ``self``; becomes the x variable.
            NOTE(review): seaborn expects a one-dimensional x, so this is
            presumably a single column label — confirm.
        y : values plotted on the y axis.
        order : int, default 2
            Polynomial order handed to ``sns.regplot``.

        Raises
        ------
        ValueError
            If ``cols`` or ``y`` is not supplied. The previous version read
            both names from the surrounding namespace, where they were not
            defined by this class.
        """
        if cols is None or y is None:
            raise ValueError('explore_correlation needs both `cols` and `y`.')
        # Keyword arguments: seaborn >= 0.12 no longer accepts x and y positionally.
        sns.regplot(x=self[cols], y=y, order=order)

        

In [None]:
# Wrap df in the DataExplorer class so its exploration helpers can be called on df1.
df1 = DataExplorer(df)

In [None]:
# Boolean row masks: sales from 2010 form the test set, every other year the training set.
test = df['YrSold'].eq(2010).values
train = df['YrSold'].ne(2010).values

In [None]:
# Fit lasso on the fixed columns for every combination of scaling technique and
# internal normalisation, keeping one report frame per combination.
lasso_reports = []
for st in [None, 'S', 'M']:
    for it in [True, False]:
        report = implement_modeling(df, model='lasso',
                                    consider_columns=fixed,
                                    get_dummies=True,
                                    scale_technique=st,
                                    internal_norm=it)
        lasso_reports.append(report.to_df())

# Concatenate once at the end: growing a frame inside the loop re-copies all
# earlier rows on every pass, and starting from an empty DataFrame triggers a
# pandas FutureWarning about empty entries in concat.
dat = pd.concat(lasso_reports)

dat

In [None]:
# Ridge on the fixed columns with dummy encoding and scale_technique='M'.
ridge_scaled_m = implement_modeling(
    df,
    model='ridge',
    scale_technique='M',
    consider_columns=fixed,
    get_dummies=True,
)
ridge_scaled_m.to_df()

In [None]:
# Ridge on the fixed columns with dummy encoding and no scaling technique.
ridge_unscaled = implement_modeling(
    df,
    model='ridge',
    scale_technique=None,
    consider_columns=fixed,
    get_dummies=True,
)
ridge_unscaled.to_df()

In [None]:
# Ridge on the fixed columns with dummy encoding, no scaling technique, and
# internal_norm switched on.
ridge_internal_norm = implement_modeling(
    df,
    model='ridge',
    scale_technique=None,
    consider_columns=fixed,
    get_dummies=True,
    internal_norm=True,
)
ridge_internal_norm.to_df()

In [None]:
# Search space for the modelling experiments. Keys follow the keyword names
# that the grid-search cell passes to implement_modeling; each value lists the
# settings to try.
# NOTE(review): this cell was an unparseable draft; the completion is a
# best-effort reading of its intent — confirm.
parameter_space = {'model': ['lm', 'ridge', 'lasso'],
                   'model_alpha': [0.1, 1, 10, 100],
                   'consider_columns': [fixed],
                   'scale_technique': [None, 'S', 'M'],
                   'do_drop_first': [True, False]}


def hyper_parameter_explorer(di):
    """Yield one settings dict per combination described by ``di``.

    Parameters
    ----------
    di : dict
        Maps a parameter name to the iterable of values to try for it.

    Yields
    ------
    dict
        ``{name: value}`` for each element of the Cartesian product of the
        value lists, in the key order of ``di`` with the last key varying
        fastest. An empty ``di`` yields a single empty dict.
    """
    from itertools import product  # local import keeps this cell self-contained

    names = list(di)
    for values in product(*(di[name] for name in names)):
        yield dict(zip(names, values))

In [None]:
# Grid over ridge settings: every alpha x scaling technique x drop-first choice
# on the fixed columns. Each combination is announced and its report collected.
reports = []
for mod in ['ridge']:
    for alpha in [0.1,1,10,100]:
        for consider_col in [fixed]:
            for scale_technique in [None,'S','M']:
                for ddf in [True,False]:

                    # Human-readable label for this combination (printed as progress).
                    key = (mod.rjust(7)
                       +' alpha='+str(round(alpha,3)).rjust(4)
                       +' Scaled by '+str(scale_technique).rjust(5)
                       +' Dropping frist: '+str(ddf).rjust(5))
                    print (key)

                    report = implement_modeling(df1,model=mod,consider_columns=consider_col,
                                                scale_technique = scale_technique,model_alpha=alpha,
                                                do_drop_first = ddf)

                    reports.append(report.to_df())

# Concatenate once at the end: growing a frame inside the loop re-copies all
# earlier rows on every pass, and starting from an empty DataFrame triggers a
# pandas FutureWarning about empty entries in concat.
dat = pd.concat(reports, axis=0)

In [None]:
# All collected reports, ordered by ascending cv_score.
(
    dat
    .reset_index()
    .sort_values(by='cv_score')
)

In [None]:
# Rank every entry of `di` by its true_score rounded to 4 places (ties fall
# back to the label), then print label and score, lowest score first.
sort = sorted((entry.true_score.round(4), label) for label, entry in di.items())
for score, label in sort:
    print(label, score)

In [None]:
# For each integer i below 100, print i next to floor(sqrt(i) + 1) * floor(sqrt(i)).
for i in range(100):
    k = np.sqrt(i)
    lower, upper = np.floor(k), np.floor(k + 1)
    print(i, upper * lower)

(0.85340352664490027, 'ridge10None')

0.85426008217280969 ridge10None after!!!

(0.87057883511318945, 'ridge10None')]


    (0.87057883511318945, 'ridge10NoneTrue'),
    (0.87113126001225183, 'ridge10NoneFalse')]


    (0.87057883511318945, 'ridge10NoneTrue'),
     (0.87113126001225183, 'ridge10NoneFalse') worked better than alpha = 100


before altering the qual to num:

        (0.8671087710751757, 'ridge alpha=100 Scaled by S Dropping frist: False'),
     (0.87057883511318945, 'ridge alpha=10 Scaled by None Dropping frist: True'),
     (0.87113126001225183, 'ridge alpha=10 Scaled by None Dropping frist: False')]
     
after 

     (0.8671087710751757, 'ridge alpha=100 Scaled by S Dropping frist: False'),
     (0.87057883511318945, 'ridge alpha=10 Scaled by None Dropping frist: True'),
     (0.87113126001225183, 'ridge alpha=10 Scaled by None Dropping frist: False')]


In [None]:
# Load housing.csv from the sibling project-02 directory; the path is relative
# to the notebook's working directory.
datatat = pd.read_csv('../project-02/housing.csv')


In [None]:
# How many columns there are of each dtype, shown as a one-column frame.
datatat.dtypes.value_counts(dropna=False).to_frame()

In [None]:
# Number of missing values per column, shown as a one-column frame.
datatat.isna().sum().to_frame()

In [None]:
# DataFrame has no .unique() method, so collect the distinct Python types one
# column at a time. After the transpose there is one row per column of datatat;
# rows with fewer distinct types are padded with NaN.
pd.DataFrame({column: pd.Series(datatat[column].apply(type).unique())
              for column in datatat.columns}).T

In [None]:
# Per-column tally of the Python types found among the values. The unfinished
# outer comprehension was dropped and the keyword corrected to `dropna`.
{key: datatat[key].apply(type).value_counts(dropna=False) for key in datatat.columns}

In [None]:
# Tally, for every column, how often each extended_type result occurs, and lay
# the tallies side by side (one frame column per data column).
extended_type_counts = {}
for column in datatat.columns:
    extended_type_counts[column] = datatat[column].apply(extended_type).value_counts(dropna=False)
pd.DataFrame(extended_type_counts)

https://stackoverflow.com/questions/27934885/how-to-hide-code-from-cells-in-ipython-notebook-visualized-with-nbviewer

In [None]:
# Show the result of df.explore_types(). A stray ".ipynb_checkpoints/" path
# fragment after the call made the cell a SyntaxError and has been removed.
df.explore_types()

In [None]:
# Rows where BsmtFinSF1 is greater than 1stFlrSF.
basement_exceeds_first_floor = df['BsmtFinSF1'] > df['1stFlrSF']
df[basement_exceeds_first_floor]

In [None]:
# Row label 535, columns 'BsmtQual' through 'GrLivArea' (.loc label slices include both ends).
df.loc[535,'BsmtQual':'GrLivArea']

In [None]:
# Scratch check: print the type of the integer literal 8.
print(type(8))

In [None]:
# Row label 535, from column 'BsmtQual' through the last column.
df.loc[535,'BsmtQual':]

In [None]:
# For the MSSubClass == 190 rows, flag those whose BsmtFinSF1 exceeds 1stFlrSF.
# NOTE(review): the comparison's right-hand side was missing; '1stFlrSF' is
# assumed from the basement-vs-first-floor filter used elsewhere in this
# notebook — confirm. axis=1 is required because the lambda indexes each row
# by column name.
df[df['MSSubClass'] == 190].apply(lambda x: x['BsmtFinSF1'] > x['1stFlrSF'], axis=1)

In [None]:
# Python type of the top-left cell of the explore_types() result.
type(df.explore_types().iloc[0,0])

In [None]:
import gc

def objects_by_id(id_):
    """Return the object tracked by the garbage collector whose ``id()`` is ``id_``.

    Only objects reported by ``gc.get_objects()`` are searched.

    Raises
    ------
    Exception
        With the message "No found" when no tracked object has that id.
    """
    missing = object()  # sentinel: distinguishes "no match" from any real object
    match = next(
        (candidate for candidate in gc.get_objects() if id(candidate) == id_),
        missing,
    )
    if match is missing:
        raise Exception("No found")
    return match
    
# NOTE(review): 0x10a5a07b8 is a hard-coded id taken from one particular kernel
# session; on a fresh kernel this lookup will most likely raise — confirm
# before relying on X.
X = objects_by_id(0x10a5a07b8)

# Boolean row masks: sales from 2010 form the test set, every other year the training set.
test = df['YrSold'].eq(2010).values
train = df['YrSold'].ne(2010).values

In [None]:
incidental<sup>[1](#myfootnote1)</sup>.
<a name="myfootnote1">1</a>: This is a footnote

In [None]:
# Number of distinct MiscVal values, counting NaN (if present) as one value.
df['MiscVal'].nunique(dropna=False)

In [None]:
# Frequency of each MiscFeature value, missing values included.
df['MiscFeature'].value_counts(dropna=False)

In [None]:
# Scratch check: multiply NaN by a number (arithmetic with NaN yields NaN).
np.nan*9

In [None]:

# Print the distinct values of each column in examine_closer that is not also
# listed in examine_closer_str, under an underlined heading.
columns_to_show = [name for name in examine_closer if name not in examine_closer_str]
for col in columns_to_show:
    heading = col+':\n{}'.format('_'*115)
    print(heading)
    plist(df[col].unique(), spacing=7, col_num=16)
    print()

In [None]:

    BsmtFinSF1
    BsmtFinSF2
    BsmtUnfSF
    TotalBsmtSF
    1stFlrSF
    2ndFlrSF
    LowQualFinSF
    GrLivArea